def _get_db_path(database, collection):
    if collection is None:
        # TODO: default to WORK_DIR config?
        return (None, Simstring.DEFAULT_UNICODE)
    else:
        try:
            conf_dir = real_directory(collection)
            projectconf = ProjectConfiguration(conf_dir)
            norm_conf = projectconf.get_normalization_config()
            for entry in norm_conf:
                dbname, dbpath, dbunicode = entry[0], entry[3], entry[4]
                if dbname == database:
                    return (dbpath, dbunicode)
            # not found in config.
            Messager.warning('DB ' + database + ' not defined in config for ' +
                             collection + ', falling back on default.')
            return (None, Simstring.DEFAULT_UNICODE)
        except Exception:
            # whatever goes wrong, just warn and fall back on the default.
            Messager.warning('Failed to get DB path from config for ' +
                             collection + ', falling back on default.')
            return (None, Simstring.DEFAULT_UNICODE)
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python, for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from httplib import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request('POST',
                             # As per: http://bugs.python.org/issue11898
                             # Force the url to be an ascii string
                             str(url_soup.path),
                             data,
                             headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            # close the connection in all cases
            if conn is not None:
                conn.close()
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                conn.request('POST',
                             url_soup.path,
                             # The document text as body
                             ann_obj.get_document_text().encode('utf8'),
                             headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
        finally:
            # close the connection in all cases
            if conn is not None:
                conn.close()
def getOcrFileTypeDetails(collection, document):
    path = path_join(real_directory(collection), 'qafunnelocrfiledetails')
    returnMap = {"status": False}
    for line in fileinput.input(path, inplace=False):
        loaded_r = json.loads(line)
        docname = str(loaded_r['document'])
        if docname == document:
            returnMap = {
                "status": True,
                "document": document,
                "ocrOutputResult": loaded_r['ocrOutputResult'],
                "identificationOutputResult": loaded_r['identificationOutputResult'],
                "identificationBoundaryOutputResult": loaded_r['identificationBoundaryOutputResult'],
                "extractionOutputResult": loaded_r['extractionOutputResult'],
                "fileType": loaded_r['fileType'],
                "lossType": loaded_r['lossType'],
                "comments": loaded_r['comments']
            }
    # close the module-level fileinput state so later calls can reopen it
    fileinput.close()
    return returnMap
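# A minimal sketch of the 'qafunnelocrfiledetails' line format that
# getOcrFileTypeDetails() above (and logOcrFileTypeDetails() below) read and
# write: one JSON object per line, keyed by document name. All field values
# here are made-up placeholders, not real results.
import json

sample_record = {
    "document": "example-doc",  # hypothetical document name
    "ocrOutputResult": "pass",
    "identificationOutputResult": "pass",
    "identificationBoundaryOutputResult": "fail",
    "extractionOutputResult": "pass",
    "fileType": "pdf",
    "lossType": "none",
    "comments": "",
}
print(json.dumps(sample_record))  # one such line per document in the file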
def create_comment(collection, document, id, comment=None):
    directory = collection
    undo_resp = {}

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)
    projectconf = ProjectConfiguration(real_dir)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    # XXX what is this doing here?
    # path_split(document)[0]

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        _set_special_comments(ann_obj, id, comment, mods,
                              undo_resp=undo_resp)

        mods_json = mods.json_response()
        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def download_collection(collection, exclude_configs=False):
    directory = collection
    real_dir = real_directory(directory)
    dir_name = basename(dirname(real_dir))
    fname = '%s.%s' % (dir_name, 'tar.gz')

    tmp_file_path = None
    try:
        tmp_file_fh, tmp_file_path = mkstemp()
        os_close(tmp_file_fh)

        tar_cmd_split = ['tar', '--exclude=.stats_cache']
        if exclude_configs:
            tar_cmd_split.extend(['--exclude=annotation.conf',
                                  '--exclude=visual.conf',
                                  '--exclude=tools.conf',
                                  '--exclude=kb_shortcuts.conf'])
        tar_cmd_split.extend(['-c', '-z', '-f', tmp_file_path, dir_name])

        tar_p = Popen(tar_cmd_split, cwd=path_join(real_dir, '..'))
        tar_p.wait()

        hdrs = [('Content-Type', 'application/octet-stream'),  # 'application/x-tgz'),
                ('Content-Disposition', 'inline; filename=%s' % fname)]
        with open(tmp_file_path, 'rb') as tmp_file:
            tar_data = tmp_file.read()

        raise NoPrintJSONError(hdrs, tar_data)
    finally:
        if tmp_file_path is not None:
            remove(tmp_file_path)
def input_text(path, doc, _id, text, start_list, current_list=None):
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(path)
    except:
        real_dir = path
    proj = ProjectConfiguration(real_dir)
    txt_lvl = TextAnnotations(proj)
    answerlist = txt_lvl.startlists[start_list].start
    with getAnnObject(path, doc) as ann:
        ann_txtLvls = ann.get_textLevels()
        annotation = None
        for i in ann_txtLvls:
            if i.type == answerlist.name:
                annotation = i
        if annotation:
            txt_lvl.set_ann(ann_txtLvls)
        else:
            ann_id = ann.get_new_id('F')
            ann.add_annotation(TextLevelAnnotation(ann_id, answerlist.name, []))
            annotation = ann.get_ann_by_id(ann_id)
            #~ ann_txtLvls = ann.get_textLevels()
        #~ if annotation.tail:
        #~     annotation.tail += ";"
        txt_lvl.startlists[start_list].currentList.set_input(text)
        txt_lvl.startlists[start_list].currentList = 'stop'
        if not current_list in txt_lvl.startlists[start_list].followed_path:
            txt_lvl.startlists[start_list].followed_path.append(current_list)
        update_annotations(ann, annotation, txt_lvl.startlists[start_list])
        #~ annotation.tail += text
        #~ annotation.ids.append(_id)
    return {'stop': True, 'annotation': str(annotation), }
def possible_arc_types(collection, origin_type, target_type):
    directory = collection

    real_dir = real_directory(directory)
    projectconf = ProjectConfiguration(real_dir)
    response = {}

    try:
        possible = projectconf.arc_types_from_to(origin_type, target_type)

        # TODO: proper error handling
        if possible is None:
            Messager.error('Error selecting arc types!', -1)
        elif possible == []:
            # nothing to select
            response['html'] = generate_empty_fieldset()
            response['keymap'] = {}
            response['empty'] = True
        else:
            # XXX TODO: intentionally breaking this; KB shortcuts
            # should no longer be sent here. Remove 'keymap' and
            # 'html' args once clientside generation done.
            arc_kb_shortcuts = {}  # select_keyboard_shortcuts(possible)

            response['keymap'] = {}
            for k, p in arc_kb_shortcuts.items():
                response['keymap'][k] = "arc_" + p

            response['html'] = generate_arc_type_html(projectconf, possible,
                                                      arc_kb_shortcuts)
    except:
        Messager.error('Error selecting arc types!', -1)
        raise

    return response
def unselect(path, doc, start_list, current_id):
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(path)
    except:
        real_dir = path
    from message import Messager
    with getAnnObject(path, doc) as ann:
        proj = ProjectConfiguration(real_dir)
        ann_txtLvls = ann.get_textLevels()
        if not ann_txtLvls:
            return get_startlist(path, doc)
        txt_lvl = TextAnnotations(proj, ann_txtLvls)
        response_list = txt_lvl.unselect(start_list, current_id)
        #answerlist = txt_lvl.startlists[start_list].start
        #~ for i in ann_txtLvls:
        #~     if i.type == answerlist.name:
        #~         update_annotations(ann, i, txt_lvl.startlists[start_list])
        #~         break
        if response_list:
            response = list_to_dict(response_list)
        else:
            response = get_startlist(path, doc)
    return response
def _save_svg(collection, document, svg):
    svg_path = _svg_path()
    with open_textfile(svg_path, 'w') as svg_file:
        svg_hdr = ('<?xml version="1.0" encoding="UTF-8" standalone="no"?>'
                   '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
                   '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">')
        defs = svg.find('</defs>')

        with open_textfile(CSS_PATH, 'r') as css_file:
            css = css_file.read()

        if defs != -1:
            css = '<style type="text/css"><![CDATA[' + css + ']]></style>'
            font_data = []
            for font_path in SVG_FONTS:
                with open_textfile(font_path, 'r') as font_file:
                    font_data.append(font_file.read().strip())
            fonts = '\n'.join(font_data)
            svg = (svg_hdr + '\n' + svg[:defs] + '\n' + fonts + '\n' + css +
                   '\n' + svg[defs:])
            svg_file.write(svg)

            # Create a copy in the svg store?
            if SVG_STORE:
                real_dir = real_directory(collection, rel_to=SVG_STORE_DIR)
                if not exists(real_dir):
                    makedirs(real_dir)
                svg_store_path = path_join(real_dir, document + '.svg')
                with open_textfile(svg_store_path, 'w') as svg_store_file:
                    svg_store_file.write(svg)
        else:
            # TODO: @amadanmath: When does this actually happen?
            raise CorruptSVGError
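# A minimal, standalone sketch of the splice _save_svg() performs: the
# stylesheet and embedded fonts are inserted just before the closing </defs>
# tag so they apply to the whole drawing. The SVG snippet and the "fonts" and
# "css" strings below are made-up placeholders.
svg = '<svg><defs></defs><text>hi</text></svg>'
fonts = '<font><!-- embedded font data --></font>'
css = '<style type="text/css"><![CDATA[text { fill: black; }]]></style>'

defs = svg.find('</defs>')
if defs != -1:
    spliced = svg[:defs] + fonts + css + svg[defs:]
    print(spliced)
    # <svg><defs><font>...</font><style ...>...</style></defs><text>hi</text></svg>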
def reverse_arc(collection, document, origin, target, type, attributes=None):
    directory = collection
    #undo_resp = {}  # TODO
    real_dir = real_directory(directory)
    #mods = ModificationTracker()  # TODO
    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse
            found = None
            # TODO: more sensible lookup
            for ann in ann_obj.get_relations():
                if (ann.arg1 == origin and ann.arg2 == target and
                        ann.type == type):
                    found = ann
                    break

            if found is None:
                Messager.error(
                    'reverse_arc: failed to identify target relation '
                    '(from %s to %s, type %s) (deleted?)' %
                    (str(origin), str(target), str(type)))
            else:
                # found it; just adjust this
                found.arg1, found.arg2 = found.arg2, found.arg1
                # TODO: modification tracker

        json_response = {}
        json_response['annotations'] = _json_from_ann(ann_obj)
        return json_response
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)
    #hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
    #        ('Content-Disposition', 'inline; filename=%s' % fname)]
    hdrs = [('Content-Type', 'application/octet-stream'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]
    if allowed_to_read(fpath):
        if not exists(fpath):
            data = ""
            if extension == "zip":
                import zipfile
                zipf = zipfile.ZipFile(fpath, 'w')
                zipf.close()
                with open(fpath, 'rb') as txt_file:
                    data = txt_file.read()
        else:
            if extension != "zip":
                with open_textfile(fpath, 'r') as txt_file:
                    data = txt_file.read().encode('utf-8')
            else:
                with open(fpath, 'rb') as txt_file:
                    data = txt_file.read()
    else:
        data = "Access Denied"
    raise NoPrintJSONError(hdrs, data)
def getAnnObject2(collection, document):
    '''Newest version of the getAnnObject method.'''
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    ann = None
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            #TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    #Validation:
    try:
        import os
        import simplejson as json
        import session
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        #validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                from verify_annotations import verify_annotation
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
def download_collection(collection, include_conf=False):
    directory = collection
    real_dir = real_directory(directory)
    dir_name = basename(dirname(real_dir))
    fname = '%s.%s' % (dir_name, 'tar.gz')

    confs = ['annotation.conf', 'visual.conf', 'tools.conf',
             'kb_shortcuts.conf']

    try:
        include_conf = int(include_conf)
    except ValueError:
        pass

    tmp_file_path = None
    try:
        tmp_file_fh, tmp_file_path = mkstemp()
        os_close(tmp_file_fh)

        tar_cmd_split = ['tar', '--exclude=.stats_cache']
        conf_names = []
        if not include_conf:
            tar_cmd_split.extend(['--exclude=%s' % c for c in confs])
        else:
            # also include configs from parent directories.
            for cname in confs:
                cdir, depth = find_in_directory_tree(real_dir, cname)
                if depth is not None and depth > 0:
                    relpath = path_join(dir_name,
                                        *['..' for _ in range(depth)])
                    conf_names.append(path_join(relpath, cname))
            if conf_names:
                # replace pathname components ending in ".." with target
                # directory name so that .confs in parent directories appear
                # in the target directory in the tar.
                tar_cmd_split.extend(['--absolute-names', '--transform',
                                      's|.*\\.\\.|%s|' % dir_name])

        tar_cmd_split.extend(['-c', '-z', '-f', tmp_file_path, dir_name])
        tar_cmd_split.extend(conf_names)
        tar_p = Popen(tar_cmd_split, cwd=path_join(real_dir, '..'))
        tar_p.wait()

        hdrs = [('Content-Type', 'application/octet-stream'),  # 'application/x-tgz'),
                ('Content-Disposition', 'inline; filename=%s' % fname)]
        with open(tmp_file_path, 'rb') as tmp_file:
            tar_data = tmp_file.read()

        raise NoPrintJSONError(hdrs, tar_data)
    finally:
        if tmp_file_path is not None:
            remove(tmp_file_path)
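# For illustration: the tar invocation download_collection() assembles for a
# hypothetical collection directory "mycoll" when configs are excluded. The
# directory name and temp file path are placeholders, not part of the code.
tar_cmd_split = ['tar', '--exclude=.stats_cache',
                 '--exclude=annotation.conf', '--exclude=visual.conf',
                 '--exclude=tools.conf', '--exclude=kb_shortcuts.conf',
                 '-c', '-z', '-f', '/tmp/tmpXXXXXX', 'mycoll']
print(' '.join(tar_cmd_split))
# tar --exclude=.stats_cache --exclude=annotation.conf ... -c -z -f /tmp/tmpXXXXXX mycoll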
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    directory = collection
    undo_resp = {}

    real_dir = real_directory(directory)

    mods = ModificationTracker()

    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        # if there is a previous annotation and the arcs aren't in
        # the same category (e.g. relation vs. event arg), process
        # as delete + create instead of update.
        if old_type is not None and (
                projectconf.is_relation_type(old_type) !=
                projectconf.is_relation_type(type) or
                projectconf.is_equiv_type(old_type) !=
                projectconf.is_equiv_type(type)):
            _delete_arc_with_ann(origin.id, old_target, old_type, mods,
                                 ann_obj, projectconf)
            old_target, old_type = None, None

        if projectconf.is_equiv_type(type):
            ann = _create_equiv(ann_obj, projectconf, mods, origin, target,
                                type, attributes, old_type, old_target)
        elif projectconf.is_relation_type(type):
            ann = _create_relation(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        else:
            ann = _create_argument(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)

        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)
        elif comment is not None:
            Messager.warning('create_arc: non-empty comment for None '
                             'annotation (unsupported type for comment?)')

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        try:
            # Note: Can we actually fit a whole document in here?
            quoted_doc_text = quote_plus(ann_obj.get_document_text())
            resp = urlopen(tagger_service_url % quoted_doc_text, None,
                           QUERY_TIMEOUT)
        except URLError:
            raise TaggerConnectionError(tagger_token)

        # TODO: Check for errors
        json_resp = loads(resp.read())

        mods = ModificationTracker()

        for ann_data in json_resp.itervalues():
            offsets = ann_data['offsets']
            # Note: We do not support discontinuous spans at this point
            assert len(offsets) == 1, 'discontinuous/null spans'
            start, end = offsets[0]
            _id = ann_obj.get_new_id('T')
            tb = TextBoundAnnotationWithText(
                start, end,
                _id,
                ann_data['type'],
                ann_data['text']
            )
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
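# A sketch of the response shape this tag() variant consumes: a JSON object
# whose values each carry a single (start, end) offset pair, a type, and the
# covered text. The keys, types, and texts below are invented placeholders.
sample_resp = {
    "0": {"offsets": [[0, 5]], "type": "Protein", "text": "BRCA1"},
    "1": {"offsets": [[10, 14]], "type": "Gene", "text": "TP53"},
}
for ann_data in sample_resp.values():
    (start, end), = ann_data['offsets']  # exactly one span per annotation
    print('%s %s %s %s' % (start, end, ann_data['type'], ann_data['text']))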
def import_xmi(text, docid, collection=None):
    '''
    TODO: DOC:
    '''
    if len(docid) > 4 and docid[-4] == '.':
        docid = docid[:-4]
    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    ############################
    from session import get_session
    try:
        username = get_session()['user']
    except KeyError:
        username = None
    if username != 'admin':
        if (not username) or username + '/' not in dir_path:
            raise NoWritePermissionError(dir_path)
    ############################

    base_path = join_path(dir_path, docid)
    xmi_path = base_path + '.zip'

    def decode_base64(data):
        import base64
        # pad to a multiple of four characters (only when actually short)
        missing_padding = len(data) % 4
        if missing_padding:
            data += b'=' * (4 - missing_padding)
        return base64.decodestring(data)

    text = decode_base64(text[len("data:application/zip;base64,"):])
    #text = text.decode('base64')
    with open(xmi_path, 'wb') as thefile:
        thefile.write(text)
    #with open_textfile(xmi_path, 'wb') as xmi_file:
    #    xmi_file.write(text)
    return {}
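# Standalone sketch of the data-URI handling above: strip the
# "data:application/zip;base64," prefix, re-pad, and decode. The payload is a
# made-up two-byte example ("PK", the zip magic), not a real archive.
import base64

uri = b"data:application/zip;base64,UEs"  # "PK", base64 without padding
payload = uri[len(b"data:application/zip;base64,"):]
missing = len(payload) % 4
if missing:
    payload += b'=' * (4 - missing)
print(base64.b64decode(payload))  # -> PK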
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)
    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]
    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')
    raise NoPrintJSONError(hdrs, data)
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)
    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]
    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read()
    raise NoPrintJSONError(hdrs, data)
def __document_to_annotations(directory, document):
    """
    Given a directory and a document, returns an Annotations object for the
    file.
    """
    # TODO: put this shared functionality in a more reasonable place
    from document import real_directory
    from os.path import join as path_join

    real_dir = real_directory(directory)
    filenames = [path_join(real_dir, document)]
    return __filenames_to_annotations(filenames)
def logOcrFileTypeDetails(collection, document, ocrOutputResult,
                          identificationOutputResult,
                          identificationBoundaryOutputResult,
                          extractionOutputResult, fileType, lossType,
                          comments):
    path = path_join(real_directory(collection), 'qafunnelocrfiledetails')
    if not os.path.exists(path):
        os.mknod(path)
    record = json.dumps({
        "document": document,
        "ocrOutputResult": ocrOutputResult,
        "identificationOutputResult": identificationOutputResult,
        "identificationBoundaryOutputResult": identificationBoundaryOutputResult,
        "extractionOutputResult": extractionOutputResult,
        "fileType": fileType,
        "lossType": lossType,
        "comments": comments
    })
    searchFlag = False
    # rewrite the file in place, replacing the line for this document
    for line in fileinput.input(path, inplace=True):
        loaded_r = json.loads(line)
        docname = str(loaded_r['document'])
        if docname == document:
            searchFlag = True
            print "%s" % (record + '\n'),
        else:
            print "%s" % (line),
    fileinput.close()
    # no existing line for this document: append one
    if not searchFlag:
        with open(path, 'a') as record_file:
            record_file.write(record + '\n')
    return {"status": 'true'}
def suggest_span_types(collection, document, start, end, text, model):
    pconf = ProjectConfiguration(real_directory(collection))
    for _, _, model_str, model_url in pconf.get_disambiguator_config():
        if model_str == model:
            break
    else:
        # We were unable to find a matching model
        raise SimSemConnectionNotConfiguredError

    try:
        quoted_text = quote_plus(text)
        resp = urlopen(model_url % quoted_text, None, QUERY_TIMEOUT)
    except URLError:
        # TODO: Could give more details
        raise SimSemConnectionError

    json = loads(resp.read())

    preds = json['result'][text.decode('utf-8')]

    selected_preds = []
    conf_sum = 0
    for cat, conf in preds:
        selected_preds.append((cat, conf, ))
        conf_sum += conf
        if conf_sum >= CUT_OFF:
            break

    log_annotation(collection, document, 'DONE', 'suggestion',
                   [None, None, text, ] + [selected_preds, ])

    # array so that server can control presentation order in UI
    # independently from scores if needed
    return {
        'types': selected_preds,
        'collection': collection,  # echo for reference
        'document': document,
        'start': start,
        'end': end,
        'text': text,
    }
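# Worked example of the cumulative-confidence cutoff above, with invented
# predictions and an assumed CUT_OFF of 0.95: categories are taken in order
# until their confidences sum past the threshold.
CUT_OFF = 0.95  # assumed value for illustration
preds = [('Protein', 0.70), ('Gene', 0.20), ('Chemical', 0.08), ('Other', 0.02)]

selected_preds = []
conf_sum = 0
for cat, conf in preds:
    selected_preds.append((cat, conf))
    conf_sum += conf
    if conf_sum >= CUT_OFF:
        break
print(selected_preds)  # [('Protein', 0.7), ('Gene', 0.2), ('Chemical', 0.08)]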
def getAnnObject(collection, document):
    try:
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if os.path.isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            #TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    #Validation:
    try:
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        #validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
def get_extra_info(path, fname):
    '''
    Method that converts extra folia annotations.
    '''
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(path)
    except:
        real_dir = path
    path = path_join(real_dir, fname)
    result = {}
    result["entities"] = []
    result["relations"] = []
    result["comments"] = []
    result["attributes"] = []
    result["tokens"] = {}
    try:
        doc = folia.Document(file=path + ".xml")
    except:
        return result
    text, offsets = parse_text(doc)

    #TOKEN ANNOTATIONS
    for i in doc.select(folia.Word):
        _id = offsets[i.id][0]
        if not _id in result["tokens"]:
            result["tokens"][_id] = {}
        index = 1
        for j in i.select(folia.Morpheme):
            result["tokens"][_id]['Morpheme' + str(index)] = ": " + str(j)
            token_anns = get_token_info(j, 'mor' + str(index) + '-')
            string = ""
            for key in token_anns:
                string += "\n" + key + token_anns[key]
            result["tokens"][_id]['Morpheme' + str(index)] += string
            j.parent.remove(j)
            index += 1
        if index > 1:
            result["tokens"][_id].update(get_token_info(i, ''))
        else:
            result["tokens"][_id] = get_token_info(i, '')

    #ANNOTATIONS REPRESENTED BY AN ENTITY
    entities, relations, comments, attributes = get_extra_entities(
        offsets, text, doc)
    result["entities"] += entities
    result["relations"] += relations
    result["comments"] += comments
    result["attributes"] += attributes
    return result
def save_import(title, text, docid, collection=None):
    '''
    TODO: DOC:
    '''
    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF
    dat_path = base_path + '.' + DATA_FILE_SUFFIX

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    with open_textfile(txt_path, 'w') as txt_file:
        txt_file.write(title + '\n' + text)

    # Touch the ann file so that we can edit the file later
    with open(ann_path, 'w') as _:
        pass

    # Touch the dat file so that we can edit the file later
    with open(dat_path, 'w') as _:
        pass

    return {'document': docid}
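# Minimal illustration of the POSIX text-file guarantee the save_import()
# variants enforce: a body lacking a trailing newline gets one appended
# before the title line and body are written out together. The title and
# body strings are invented examples.
title = 'Example title'
text = 'Body without trailing newline'

if text != "" and text[-1] != '\n':
    text = text + '\n'
print(repr(title + '\n' + text))  # 'Example title\nBody without trailing newline\n'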
def __directory_to_annotations(directory):
    """
    Given a directory, returns Annotations objects for contained files.
    """
    # TODO: put this shared functionality in a more reasonable place
    from document import real_directory, _listdir
    from os.path import join as path_join

    real_dir = real_directory(directory)
    # Get the document names
    base_names = [fn[0:-4] for fn in _listdir(real_dir)
                  if fn.endswith('txt')]
    filenames = [path_join(real_dir, bn) for bn in base_names]
    return __filenames_to_annotations(filenames)
def save_import(text, docid, collection=None):
    '''
    TODO: DOC:
    '''
    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    lemmatized_text1 = lemmatizer(text)
    real_lemmatized_text = lemmatizer2(text)
    lemmatized_text = list(izip_longest(lemmatized_text1,
                                        real_lemmatized_text))
    conll_text = conll(lemmatized_text)
    standoff_main(conll_text, docid)
    return {'document': docid}
def get_startlist(path, doc):
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(path)
    except:
        real_dir = path
    ann = getAnnObject(path, doc)
    proj = ProjectConfiguration(real_dir)
    try:
        txt_lvl = TextAnnotations(proj, ann.get_textLevels())
    except NoTextLevelConf as e:
        return {'exception': str(e)}
    response = startlist_to_dict(txt_lvl.startlists)
    # back_pos tells whether there is still at least one answer left that
    # can be removed
    response["back_pos"] = False
    return response
def set_status(directory, document, status=None):
    real_dir = real_directory(directory)
    json_dic = {'status': status}

    with TextAnnotations(path_join(real_dir, document)) as ann:
        # Erase all old status annotations
        for old_status in ann.get_statuses():
            ann.del_annotation(old_status)

        if status is not None:
            # XXX: This could work, not sure if it can induce an id collision
            new_status_id = ann.get_new_id('#')
            ann.add_annotation(
                OnelineCommentAnnotation(status, new_status_id, 'STATUS', ''))

    return json_dic
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)
    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]
    # Folia conversion added by Sander Naert
    from brat2folia import convert
    if extension == 'xml':
        # convert to folia
        convert(real_dir, document)
    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')
    raise NoPrintJSONError(hdrs, data)
def save_import(title, text, docid, collection=None):
    '''
    TODO: DOC:
    '''
    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    with open_textfile(txt_path, 'w') as txt_file:
        txt_file.write(title + '\n' + text)

    # Touch the ann file so that we can edit the file later
    with open(ann_path, 'w') as _:
        pass

    return {'document': docid}
def save_web_page_import(url, docid, overwrite, collection=None):
    '''
    TODO: DOC:
    '''
    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            if not overwrite or overwrite == 'false':
                raise FileExistsError(path)
            remove(path)

    apiUrl = 'http://api-ie.qna.bf2.yahoo.com:4080/ie_ws/v1/ie_ws?url=' + url
    data = getApiData(apiUrl)
    # location = join_path(dir_path, 'input.json')
    # data = getFileData(location)
    try:
        json_resp = loads(data)
    except ValueError, e:
        raise FormatError(apiUrl, e)
def folder_import(docid, collection=None):
    '''
    TODO: DOC:
    '''
    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    ############################
    from session import get_session
    try:
        username = get_session()['user']
    except KeyError:
        username = None
    if username != 'admin':
        if (not username) or username + '/' not in dir_path:
            raise NoWritePermissionError(dir_path)
    ############################

    base_path = join_path(dir_path, docid)
    base_path = abspath(base_path)

    # Before we proceed, verify that we are not overwriting
    if exists(base_path):
        raise FileExistsError(base_path)
    makedirs(base_path)

    return {'document': base_path[len('/brat/data/'):]}
def delete_span(collection, document, id): directory = collection real_dir = real_directory(directory) document = path_join(real_dir, document) txt_file_path = document + "." + TEXT_FILE_SUFFIX with TextAnnotations(document) as ann_obj: mods = ModificationTracker() # TODO: Handle a failure to find it # XXX: Slow, O(2N) ann = ann_obj.get_ann_by_id(id) try: # Note: need to pass the tracker to del_annotation to track # recursive deletes. TODO: make usage consistent. ann_obj.del_annotation(ann, mods) try: trig = ann_obj.get_ann_by_id(ann.trigger) try: ann_obj.del_annotation(trig, mods) except DependingAnnotationDeleteError: # Someone else depended on that trigger pass except AttributeError: pass except DependingAnnotationDeleteError, e: Messager.error(e.html_error_str()) return {"exception": True} # print 'Content-Type: application/json\n' if DEBUG: mods_json = mods.json_response() else: mods_json = {} # save a roundtrip and send the annotations also txt_file_path = document + "." + TEXT_FILE_SUFFIX j_dic = _json_from_ann_and_txt(ann_obj, txt_file_path) mods_json["annotations"] = j_dic return mods_json
def get_list(path, doc):
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(path)
    except:
        real_dir = path
    ann = getAnnObject(path, doc)
    proj = ProjectConfiguration(real_dir)
    try:
        txt_lvl = TextAnnotations(proj, ann.get_textLevels())
    except NoTextLevelConf as e:
        return {'exception': str(e)}
    #~ if txt_lvl.currentList == "stop":
    #~     return {'stop': True, 'annotation': str(txt_lvl.selectedList.ann), }
    response = list_to_dict(txt_lvl.selectedList.currentList)
    # back_pos tells whether there is still at least one answer left that
    # can be removed
    response["back_pos"] = False
    if len(txt_lvl.followed_path) > 0:
        response["back_pos"] = True
    return response
def delete_arc(collection, document, origin, target, type):
    directory = collection

    real_dir = real_directory(directory)

    mods = ModificationTracker()

    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        _delete_arc_with_ann(origin, target, type, mods, ann_obj, projectconf)

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def delete_span(collection, document, id):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        # TODO: Handle a failure to find it
        # XXX: Slow, O(2N)
        ann = ann_obj.get_ann_by_id(id)
        try:
            # Note: need to pass the tracker to del_annotation to track
            # recursive deletes. TODO: make usage consistent.
            ann_obj.del_annotation(ann, mods)
            try:
                trig = ann_obj.get_ann_by_id(ann.trigger)
                try:
                    ann_obj.del_annotation(trig, mods)
                except DependingAnnotationDeleteError:
                    # Someone else depended on that trigger
                    pass
            except AttributeError:
                pass
        except DependingAnnotationDeleteError, e:
            Messager.error(e.html_error_str())
            return {'exception': True, }

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def convert(path, fname):
    from message import Messager
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(path)
    except:
        real_dir = path
    full_path = path_join(real_dir, fname)
    entity_ids = {}
    try:
        doc = folia.Document(file=full_path + ".xml")
        temp = open(full_path + ".ann", 'w')
        txt = open(full_path + ".txt", 'w')
    except IOError as e:
        Messager.error("IOError " + str(e))
        return {'result': False, }
    ann_obj = TextAnnotations(full_path)
    text, offsets = parse_text(doc)
    with SimpleAnnotations(ann_obj) as ann:
        add_entities(doc, ann, entity_ids, offsets, text)
        add_relations(doc, ann, entity_ids, offsets)
        add_attributes(doc, ann, entity_ids)
        add_comments(doc, ann, entity_ids)
        try:
            ann.folia = get_extra_info(path, fname)
        except:
            Messager.error("get_extra_info() from folia failed")
            ann.folia = {}
        txt.write(text)
        txt.close()
        #~ temp.write(str(ann))
        #~ temp.close()
        make_conf_file(real_dir, ann)
    # the return is needed for the client, so it can see the function is
    # done; this can take a few seconds
    return {'result': True, }
def _get_db_path(database, collection):
    if collection is None:
        # TODO: default to WORK_DIR config?
        return None
    else:
        try:
            conf_dir = real_directory(collection)
            projectconf = ProjectConfiguration(conf_dir)
            norm_conf = projectconf.get_normalization_config()
            for entry in norm_conf:
                dbname, dbpath = entry[0], entry[3]
                if dbname == database:
                    return dbpath
            # not found in config.
            Messager.warning('DB ' + database + ' not defined in config for ' +
                             collection + ', falling back on default.')
            return None
        except Exception:
            # whatever goes wrong, just warn and fall back on the default.
            Messager.warning('Failed to get DB path from config for ' +
                             collection + ', falling back on default.')
            return None
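# Sketch of the lookup both _get_db_path() variants perform over the
# normalization config: each entry is a tuple whose field 0 is the DB name
# and field 3 its on-disk path (field 4, where present, a unicode flag). The
# entries below are invented placeholders, not a real configuration.
norm_conf = [
    ('UMLS', 'NAME', '<URL>', '/work/simstring/umls.db', True),
    ('Wiki', 'NAME', '<URL>', '/work/simstring/wiki.db', False),
]

def lookup(database):
    for entry in norm_conf:
        dbname, dbpath = entry[0], entry[3]
        if dbname == database:
            return dbpath
    return None  # caller falls back on the default

print(lookup('UMLS'))  # /work/simstring/umls.db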
def reverse_arc(collection, document, origin, target, type, attributes=None):
    directory = collection
    # undo_resp = {}  # TODO
    real_dir = real_directory(directory)
    # mods = ModificationTracker()  # TODO
    projectconf = ProjectConfiguration(real_dir)
    document = urllib.parse.unquote(document)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse
            found = None
            # TODO: more sensible lookup
            for ann in ann_obj.get_relations():
                if (ann.arg1 == origin and ann.arg2 == target and
                        ann.type == type):
                    found = ann
                    break

            if found is None:
                Messager.error(
                    'reverse_arc: failed to identify target relation '
                    '(from %s to %s, type %s) (deleted?)' %
                    (str(origin), str(target), str(type)))
            else:
                # found it; just adjust this
                found.arg1, found.arg2 = found.arg2, found.arg1
                # TODO: modification tracker

        json_response = {}
        json_response['annotations'] = _json_from_ann(ann_obj)
        return json_response
def _create_span(collection, document, offsets, _type, attributes=None,
                 normalizations=None, _id=None, comment=None):
    if _offset_overlaps(offsets):
        raise SpanOffsetOverlapError(offsets)

    directory = collection
    undo_resp = {}

    _attributes = _parse_attributes(attributes)
    _normalizations = _parse_span_normalizations(normalizations)

    #log_info('ATTR: %s' % (_attributes, ))

    real_dir = real_directory(directory)
    document = path_join(real_dir, document)

    projectconf = ProjectConfiguration(real_dir)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    path_split(document)[0]

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        if _id is not None:
            # We are to edit an existing annotation
            tb_ann, e_ann = _edit_span(ann_obj, mods, _id, offsets,
                                       projectconf, _attributes, _type,
                                       undo_resp=undo_resp)
        else:
            # We are to create a new annotation
            tb_ann, e_ann = __create_span(ann_obj, mods, _type, offsets,
                                          txt_file_path, projectconf,
                                          _attributes)

            undo_resp['action'] = 'add_tb'
            if e_ann is not None:
                undo_resp['id'] = e_ann.id
            else:
                undo_resp['id'] = tb_ann.id

        # Determine which annotation attributes, normalizations,
        # comments etc. should be attached to. If there's an event,
        # attach to that; otherwise attach to the textbound.
        if e_ann is not None:
            # Assign to the event, not the trigger
            target_ann = e_ann
        else:
            target_ann = tb_ann

        # Set attributes
        _set_attributes(ann_obj, target_ann, _attributes, mods,
                        undo_resp=undo_resp)

        # Set normalizations
        _set_normalizations(ann_obj, target_ann, _normalizations, mods,
                            undo_resp=undo_resp)

        # Set comments
        if tb_ann is not None:
            _set_comments(ann_obj, target_ann, comment, mods,
                          undo_resp=undo_resp)

        if tb_ann is not None:
            mods_json = mods.json_response()
        else:
            # Hack, probably we had a new-line in the span
            mods_json = {}
            Messager.error('Text span contained new-line, rejected',
                           duration=3)

        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def split_span(collection, document, args, id):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    # TODO don't know how to pass an array directly, so doing extra catenate
    # and split
    tosplit_args = json_loads(args)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        ann = ann_obj.get_ann_by_id(id)

        # currently only allowing splits for events
        if not isinstance(ann, EventAnnotation):
            raise AnnotationSplitError(
                "Cannot split an annotation of type %s" % ann.type)

        # group event arguments into ones that will be split on and
        # ones that will not, placing the former into a dict keyed by
        # the argument without trailing numbers (e.g. "Theme1" ->
        # "Theme") and the latter in a straight list.
        split_args = {}
        nonsplit_args = []
        import re
        for arg, aid in ann.args:
            m = re.match(r'^(.*?)\d*$', arg)
            if m:
                arg = m.group(1)
            if arg in tosplit_args:
                if arg not in split_args:
                    split_args[arg] = []
                split_args[arg].append(aid)
            else:
                nonsplit_args.append((arg, aid))

        # verify that split is possible
        for a in tosplit_args:
            acount = len(split_args.get(a, []))
            if acount < 2:
                raise AnnotationSplitError(
                    "Cannot split %s on %s: only %d %s arguments "
                    "(need two or more)" % (ann.id, a, acount, a))

        # create all combinations of the args on which to split
        argument_combos = [[]]
        for a in tosplit_args:
            new_combos = []
            for aid in split_args[a]:
                for c in argument_combos:
                    new_combos.append(c + [(a, aid)])
            argument_combos = new_combos

        # create the new events (first combo will use the existing event)
        from copy import deepcopy
        new_events = []
        for i, arg_combo in enumerate(argument_combos):
            # tweak args
            if i == 0:
                ann.args = nonsplit_args[:] + arg_combo
            else:
                newann = deepcopy(ann)
                # TODO: avoid hard-coding ID prefix
                newann.id = ann_obj.get_new_id("E")
                newann.args = nonsplit_args[:] + arg_combo
                ann_obj.add_annotation(newann)
                new_events.append(newann)
                mods.addition(newann)

        # then, go through all the annotations referencing the original
        # event, and create appropriate copies
        for a in ann_obj:
            soft_deps, hard_deps = a.get_deps()
            refs = soft_deps | hard_deps
            if ann.id in refs:
                # Referenced; make duplicates appropriately
                if isinstance(a, EventAnnotation):
                    # go through args and make copies for referencing
                    new_args = []
                    for arg, aid in a.args:
                        if aid == ann.id:
                            for newe in new_events:
                                new_args.append((arg, newe.id))
                    a.args.extend(new_args)
                elif isinstance(a, AttributeAnnotation):
                    for newe in new_events:
                        newmod = deepcopy(a)
                        newmod.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newmod.id = ann_obj.get_new_id("A")
                        ann_obj.add_annotation(newmod)
                        mods.addition(newmod)
                elif isinstance(a, BinaryRelationAnnotation):
                    # TODO
                    raise AnnotationSplitError(
                        "Cannot adjust annotation referencing split: not "
                        "implemented for relations! (WARNING: annotations "
                        "may be in inconsistent state, please reload!) "
                        "(Please complain to the developers to fix this!)")
                elif isinstance(a, OnelineCommentAnnotation):
                    for newe in new_events:
                        newcomm = deepcopy(a)
                        newcomm.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newcomm.id = ann_obj.get_new_id("#")
                        ann_obj.add_annotation(newcomm)
                        mods.addition(newcomm)
                elif isinstance(a, NormalizationAnnotation):
                    for newe in new_events:
                        newnorm = deepcopy(a)
                        newnorm.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newnorm.id = ann_obj.get_new_id("N")
                        ann_obj.add_annotation(newnorm)
                        mods.addition(newnorm)
                else:
                    raise AnnotationSplitError(
                        "Cannot adjust annotation referencing split: not "
                        "implemented for %s! (Please complain to the lazy "
                        "developers to fix this!)" % a.__class__)

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    print("tagger", tagger, file=sys.stderr)
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python, for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                # Note: Trout slapping for anyone sending Unicode objects here
                data = str(doc_path) + "#*^$#" + ann_obj.get_document_text()
                data = data.encode('utf-8')
                req_headers['Content-length'] = len(data)
                conn.request('POST',
                             # As per: http://bugs.python.org/issue11898
                             # Force the url to be an ascii string
                             str(service_url),
                             data,
                             headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        cidmap = {}

        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(offsets), \
                'Tagger response has different numbers of offsets and texts'

            start, end = offsets[0]
            text = texts[0]

            _id = ann_obj.get_new_id('T')
            print("_id", _id, file=sys.stderr)
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))

            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
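# A sketch of a tagger response this Python 3 tag() variant accepts:
# text-bound annotations carry parallel "offsets" and "texts" lists, and
# normalizations point at a text-bound's key via "target" (the code routes
# entries with the assumed _is_textbound()/_is_normalization() helpers).
# All keys, IDs, and values below are invented for illustration.
sample_resp = {
    "1": {
        "offsets": [[0, 5]],
        "type": "Protein",
        "texts": ["BRCA1"],
    },
    "2": {
        "type": "Reference",
        "target": "1",          # key of the text-bound above
        "refdb": "UniProt",
        "refid": "P38398",
    },
}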
def delete_arc(collection, document, origin, target, type):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        # This can be an event or an equiv
        #TODO: Check for None!
        try:
            event_ann = ann_obj.get_ann_by_id(origin)
            # Try if it is an event
            arg_tup = (type, unicode(target))
            if arg_tup in event_ann.args:
                before = unicode(event_ann)
                event_ann.args.remove(arg_tup)
                mods.change(before, event_ann)

                '''
                if not event_ann.args:
                    # It was the last argument tuple, remove it all
                    try:
                        ann_obj.del_annotation(event_ann)
                        mods.deletion(event_ann)
                    except DependingAnnotationDeleteError, e:
                        #XXX: Old message api
                        print 'Content-Type: application/json\n'
                        print dumps(e.json_error_response())
                        return
                '''
            else:
                # What we were to remove did not even exist in the first
                # place
                pass
        except AttributeError:
            projectconf = ProjectConfiguration(real_dir)
            if projectconf.is_equiv_type(type):
                # It is an equiv then?
                #XXX: Slow hack! Should have a better accessor! O(eq_ann)
                for eq_ann in ann_obj.get_equivs():
                    # We don't assume that the ids only occur in one Equiv,
                    # we keep on going since the data "could" be corrupted
                    if (unicode(origin) in eq_ann.entities and
                            unicode(target) in eq_ann.entities):
                        before = unicode(eq_ann)
                        eq_ann.entities.remove(unicode(origin))
                        eq_ann.entities.remove(unicode(target))
                        mods.change(before, eq_ann)

                    if len(eq_ann.entities) < 2:
                        # We need to delete this one
                        try:
                            ann_obj.del_annotation(eq_ann)
                            mods.deletion(eq_ann)
                        except DependingAnnotationDeleteError, e:
                            #TODO: This should never happen, dep on equiv
                            #print 'Content-Type: application/json\n'
                            # TODO: Proper exception here!
                            Messager.error(e.json_error_response())
                            return {}
            elif type in projectconf.get_relation_types():
                for ann in ann_obj.get_relations():
                    if (ann.type == type and ann.arg1 == origin and
                            ann.arg2 == target):
                        ann_obj.del_annotation(ann)
                        mods.deletion(ann)
                        break
            else: