def possible_arc_types(collection, origin_type, target_type):
    directory = collection

    real_dir = real_directory(directory)
    projectconf = ProjectConfiguration(real_dir)
    response = {}

    try:
        possible = projectconf.arc_types_from_to(origin_type, target_type)

        # TODO: proper error handling
        if possible is None:
            Messager.error('Error selecting arc types!', -1)
        elif possible == []:
            # nothing to select
            response['html'] = generate_empty_fieldset()
            response['keymap'] = {}
            response['empty'] = True
        else:
            # XXX TODO: intentionally breaking this; KB shortcuts
            # should no longer be sent here. Remove 'keymap' and
            # 'html' args once clientside generation done.
            arc_kb_shortcuts = {}  # select_keyboard_shortcuts(possible)

            response['keymap'] = {}
            for k, p in arc_kb_shortcuts.items():
                response['keymap'][k] = "arc_" + p

            response['html'] = generate_arc_type_html(projectconf, possible,
                                                      arc_kb_shortcuts)
    except:
        Messager.error('Error selecting arc types!', -1)
        raise

    return response
def _server_crash(cookie_hdrs, e):
    from config import ADMIN_CONTACT_EMAIL, DEBUG
    from jsonwrap import dumps
    from message import Messager
    from sys import stderr
    from time import time

    stack_trace = _get_stack_trace()

    if DEBUG:
        # Send back the stack-trace as json
        error_msg = '\n'.join(('Server Python crash, stack-trace is:\n',
                               stack_trace))
        Messager.error(error_msg, duration=-1)
    else:
        # Give the user an error message
        # Use the current time since epoch as an id for later log look-up
        error_msg = ('The server encountered a serious error, '
                     'please contact the administrators at %s '
                     'and give the id #%d'
                     ) % (ADMIN_CONTACT_EMAIL, int(time()))
        Messager.error(error_msg, duration=-1)

    # Print to stderr so that the exception is logged by the webserver
    print(stack_trace, file=stderr)

    json_dic = {
        'exception': 'serverCrash',
    }
    return (cookie_hdrs, ((JSON_HDR, ), dumps(Messager.output_json(json_dic))))
def _server_crash(cookie_hdrs, e):
    import sys
    from config import ADMIN_CONTACT_EMAIL, DEBUG
    from jsonwrap import dumps
    from message import Messager
    from time import time

    stack_trace = _get_stack_trace()

    if DEBUG:
        # Send back the stack-trace as json
        error_msg = '\n'.join(('Server Python crash, stack-trace is:\n',
                               stack_trace))
        Messager.error(error_msg, duration=-1)
    else:
        # Give the user an error message
        # Use the current time since epoch as an id for later log look-up
        error_msg = ('The server encountered a serious error, '
                     'please contact the administrators at %s '
                     'and give the id #%d'
                     ) % (ADMIN_CONTACT_EMAIL, int(time()))
        Messager.error(error_msg, duration=-1)

    # Print to stderr so that the exception is logged by the webserver
    print(stack_trace, file=sys.stderr)

    json_dic = {
        'exception': 'serverCrash',
    }
    return (cookie_hdrs, ((JSON_HDR, ), dumps(Messager.output_json(json_dic))))
def ann_logger(directory):
    """
    Lazy initializer for the annotation logger. Returns None if
    annotation logging is not configured for the given directory and a
    logger otherwise.
    """
    if ann_logger.__logger == False:  # not initialized
        annlogfile = options_get_annlogfile(directory)
        if annlogfile == '<NONE>':  # not configured
            ann_logger.__logger = None
        else:  # initialize
            try:
                l = logging.getLogger('annotation')
                l.setLevel(logging.INFO)
                handler = logging.FileHandler(annlogfile)
                handler.setLevel(logging.INFO)
                formatter = logging.Formatter('%(asctime)s\t%(message)s')
                handler.setFormatter(formatter)
                l.addHandler(handler)
                ann_logger.__logger = l
            except IOError, e:
                Messager.error("""Error: failed to initialize annotation log %s: %s.
Edit action not logged.
Please check the Annotation-log logfile setting in tools.conf""" % (annlogfile, e))
                logging.error("Failed to initialize annotation log %s: %s" %
                              (annlogfile, e))
                ann_logger.__logger = None
    return ann_logger.__logger
def ann_logger(directory):
    """
    Lazy initializer for the annotation logger. Returns None if
    annotation logging is not configured for the given directory and a
    logger otherwise.
    """
    if ann_logger.__logger == False:  # not initialized
        annlogfile = options_get_annlogfile(directory)
        if annlogfile == '<NONE>':  # not configured
            ann_logger.__logger = None
        else:  # initialize
            try:
                l = logging.getLogger('annotation')
                l.setLevel(logging.INFO)
                handler = logging.FileHandler(annlogfile)
                handler.setLevel(logging.INFO)
                formatter = logging.Formatter('%(asctime)s\t%(message)s')
                handler.setFormatter(formatter)
                l.addHandler(handler)
                ann_logger.__logger = l
            except IOError as e:
                Messager.error(
                    """Error: failed to initialize annotation log %s: %s.
Edit action not logged.
Please check the Annotation-log logfile setting in tools.conf""" % (annlogfile, e))
                logging.error("Failed to initialize annotation log %s: %s" %
                              (annlogfile, e))
                ann_logger.__logger = None
    return ann_logger.__logger
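# The lazy-init pattern above relies on a sentinel attached to the function
# object at import time (assumed, not shown in this excerpt):
#
#     ann_logger.__logger = False   # False = "never tried", None = "disabled"
#
# A minimal, self-contained sketch of the same idiom, with illustrative names:
import logging

def _demo_cached_logger(logfile):
    if _demo_cached_logger._logger is False:  # initialize exactly once
        l = logging.getLogger('demo')
        l.setLevel(logging.INFO)
        handler = logging.FileHandler(logfile)
        handler.setFormatter(logging.Formatter('%(asctime)s\t%(message)s'))
        l.addHandler(handler)
        _demo_cached_logger._logger = l
    return _demo_cached_logger._logger

_demo_cached_logger._logger = False  # sentinel set at import time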
def _config_check():
    from message import Messager
    from sys import path
    from copy import deepcopy
    from os.path import dirname

    # Reset the path to force config.py to be in the root (could be hacked
    # using __init__.py, but we can be monkey-patched anyway)
    orig_path = deepcopy(path)

    # Can't you empty in O(1) instead of O(N)?
    while path:
        path.pop()
    path.append(path_join(abspath(dirname(__file__)), '../..'))

    # Check if we have a config, otherwise whine
    try:
        import config
        del config
    except ImportError, e:
        path.extend(orig_path)
        # "Prettiest" way to check specific failure
        if e.message == 'No module named config':
            Messager.error(_miss_config_msg(), duration=-1)
        else:
            Messager.error(_get_stack_trace(), duration=-1)
        raise ConfigurationError
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    """
    Given a list of strings, a DB name, and simstring options, builds
    a simstring DB for the strings.
    """
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"

        db = simstring.writer(dbfn)
        for s in strs:
            db.insert(s)
        db.close()
    except:
        print >> sys.stderr, "Error building simstring DB"
        raise

    return dbfn
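# A hedged end-to-end sketch of the build/open cycle using only calls that
# appear in this module (requires the simstring bindings; the DB name 'demo'
# and the example strings are illustrative, not from the source):
def _demo_ssdb_roundtrip():
    try:
        import simstring  # noqa
    except ImportError:
        return  # nothing to demonstrate without the library
    ssdb_build(['protein kinase', 'kinase inhibitor'], 'demo')
    db = ssdb_open('demo')
    db.threshold = 1.0
    hits = db.retrieve('protein kinase')  # exact string stored above
    db.close()
    return hits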
def ssdb_supstring_exists(s, dbname, threshold=DEFAULT_THRESHOLD):
    """Given a string s and a DB name, returns whether at least one
    string in the associated simstring DB likely contains s as an
    (approximate) substring."""
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    if threshold == 1.0:
        # optimized (not hugely, though) for this common case
        db = ssdb_open(dbname.encode('UTF-8'))

        __set_db_measure(db, 'overlap')
        db.threshold = threshold
        result = db.retrieve(s)
        db.close()

        # assume simstring DBs always contain UTF-8-encoded strings
        result = [r.decode('UTF-8') for r in result]
        for r in result:
            if s in r:
                return True
        return False
    else:
        # naive implementation for everything else
        return len(ssdb_supstring_lookup(s, dbname, threshold)) != 0
def set_Ann_state(self, directory, file, state):
    real_dir = real_directory(directory)
    assert_allowed_to_read(real_dir)
    # check and update
    try:
        cursor = self.conn.cursor()
        cursor.execute("""BEGIN TRANSACTION""")
        cursor.execute(
            """SELECT userName FROM Ann WHERE fileDirAbs = ? and fileName = ?;""",
            (directory, file))
        rows = cursor.fetchall()
        # update only when a matching row actually exists; with the
        # original "== 0" guard the UPDATE could never match a row
        if len(rows) != 0:
            cursor.execute(
                """UPDATE Ann SET state = ? WHERE fileDirAbs = ? and fileName = ?;""",
                (state, directory, file))
    except sqlite3.Error as e:
        # print("Database error: %s" % e, file=sys.stderr)
        Messager.error("Database error: %s" % e)
        self.conn.rollback()
    except Exception as e:
        # print("Exception in _query: %s" % e, file=sys.stderr)
        Messager.error("Exception in _query: %s" % e)
        self.conn.rollback()
    finally:
        cursor.execute("COMMIT")
        cursor.close()
def reverse_arc(collection, document, origin, target, type, attributes=None):
    directory = collection
    # undo_resp = {}  # TODO
    real_dir = real_directory(directory)
    # mods = ModificationTracker()  # TODO
    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse
            found = None
            # TODO: more sensible lookup
            for ann in ann_obj.get_relations():
                if (ann.arg1 == origin and ann.arg2 == target and
                        ann.type == type):
                    found = ann
                    break

            if found is None:
                Messager.error('reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' % (str(origin), str(target), str(type)))
            else:
                # found it; just adjust this
                found.arg1, found.arg2 = found.arg2, found.arg1
                # TODO: modification tracker

        json_response = {}
        json_response['annotations'] = _json_from_ann(ann_obj)
        return json_response
def __init__(self):
    # Connect to the SQLite database. The database file is DB_FNAME;
    # if it does not exist, it is created automatically in the current
    # directory.
    flag_exist = os.path.isfile(DB_FNAME)
    self.conn = sqlite3.connect(DB_FNAME)
    if flag_exist:
        return None
    try:
        cursor = self.conn.cursor()
        cursor.execute(_CREATE_ANN_SQL)
        self.conn.commit()
    except sqlite3.Error as e:
        # print("Database error: %s" % e, file=sys.stderr)
        Messager.error("Database error: %s" % e)
        self.conn.rollback()
        self.conn.close()
    except Exception as e:
        # print("Exception in _query: %s" % e, file=sys.stderr)
        Messager.error("Exception in _query: %s" % e)
        self.conn.rollback()
        self.conn.close()
    finally:
        cursor.close()

    en_import_DATA = True
    if en_import_DATA:
        for dir in [x[0].replace(DATA_DIR, '') + '/'
                    for x in os.walk(DATA_DIR)]:
            if len(dir) > 1:
                self.import_files(dir)
    return None
def import_files(self, directory):
    real_dir = real_directory(directory)
    assert_allowed_to_read(real_dir)
    # Get the document names
    file_names = [fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')]
    try:
        cursor = self.conn.cursor()
        for filename in file_names:
            state, fid, fileName, fileDirAbs, uid, userName = \
                Ann_NULL, 0, filename, directory, 0, None
            cursor.execute(
                _INSERT_ANN_SQL,
                (state, fid, fileName, fileDirAbs, uid, userName))
        self.conn.commit()
    except sqlite3.Error as e:
        # print("Database error: %s" % e, file=sys.stderr)
        Messager.error("Database error: %s" % e)
        self.conn.rollback()
    except Exception as e:
        # print("Exception in _query: %s" % e, file=sys.stderr)
        Messager.error("Exception in _query: %s" % e)
        self.conn.rollback()
    finally:
        cursor.close()
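# The Ann table implied by _INSERT_ANN_SQL above appears to carry six columns
# in this order: (state, fid, fileName, fileDirAbs, uid, userName). A sketch
# of a compatible _CREATE_ANN_SQL (column types are an assumption, not taken
# from the source):
#
#     CREATE TABLE IF NOT EXISTS Ann (
#         state      INTEGER,
#         fid        INTEGER,
#         fileName   TEXT,
#         fileDirAbs TEXT,
#         uid        INTEGER,
#         userName   TEXT
#     );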
def ann_logger():
    """
    Lazy initializer for the annotation logger. Returns None if
    annotation logging is not configured and a logger otherwise.
    """
    if ann_logger.__logger == False:  # not initialized
        if ANNOTATION_LOG is None:  # not configured
            ann_logger.__logger = None
        else:  # initialize
            try:
                l = logging.getLogger('annotation')
                l.setLevel(logging.INFO)
                handler = logging.FileHandler(ANNOTATION_LOG)
                handler.setLevel(logging.INFO)
                formatter = logging.Formatter('%(asctime)s\t%(message)s')
                handler.setFormatter(formatter)
                l.addHandler(handler)
                ann_logger.__logger = l
            except IOError, e:
                Messager.error(
                    """Error: failed to initialize annotation log %s: %s.
Edit action not logged.
Please check ANNOTATION_LOG setting in config.py""" % (ANNOTATION_LOG, e))
                logging.error("Failed to initialize annotation log %s: %s" %
                              (ANNOTATION_LOG, e))
                ann_logger.__logger = None
    return ann_logger.__logger
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    '''
    Given a list of strings, a DB name, and simstring options, builds
    a simstring DB for the strings.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"

        db = simstring.writer(dbfn)
        for s in strs:
            db.insert(s)
        db.close()
    except:
        print >> sys.stderr, "Error building simstring DB"
        raise

    return dbfn
def _get_match_regex(text, text_match="word", match_case=False,
                     whole_string=False):
    """
    Helper for the various search_anns_for_ functions.
    """
    if match_case:
        regex_flags = 0
    else:
        regex_flags = re.IGNORECASE

    if text is None:
        text = ''

    if text_match == "word":
        # full word match: require word boundaries or, optionally,
        # whole string boundaries
        if whole_string:
            return re.compile(r'^' + re.escape(text) + r'$', regex_flags)
        else:
            return re.compile(r'\b' + re.escape(text) + r'\b', regex_flags)
    elif text_match == "substring":
        # any substring match, as text (nonoverlapping matches)
        return re.compile(re.escape(text), regex_flags)
    elif text_match == "regex":
        try:
            return re.compile(text, regex_flags)
        except:  # whatever (sre_constants.error, other?)
            Messager.warning('Given string "%s" is not a valid regular expression.' % text)
            return None
    else:
        Messager.error('Unrecognized search match specification "%s"' % text_match)
        return None
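# Behaviour sketch for _get_match_regex, following directly from the branches
# above (standard library re only; the invalid-regex case also emits a
# Messager warning):
def _demo_match_regex():
    assert _get_match_regex('cat').search('the cat sat')        # word hit
    assert not _get_match_regex('cat').search('concatenate')    # \b blocks it
    assert _get_match_regex('cat', text_match='substring').search('concatenate')
    assert _get_match_regex('f(o', text_match='regex') is None  # warns, returns None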
def ssdb_supstring_exists(s, dbname, threshold=DEFAULT_THRESHOLD):
    '''
    Given a string s and a DB name, returns whether at least one string
    in the associated simstring DB likely contains s as an
    (approximate) substring.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    if threshold == 1.0:
        # optimized (not hugely, though) for this common case
        db = ssdb_open(dbname.encode('UTF-8'))

        __set_db_measure(db, 'overlap')
        db.threshold = threshold
        result = db.retrieve(s)
        db.close()

        # assume simstring DBs always contain UTF-8-encoded strings
        result = [r.decode('UTF-8') for r in result]
        s = s.decode('UTF-8')
        for r in result:
            if s in r:
                return True
        return False
    else:
        # naive implementation for everything else
        return len(ssdb_supstring_lookup(s, dbname, threshold)) != 0
def retrieve_stored(document, suffix):
    stored_path = _stored_path() + '.' + suffix

    if not isfile(stored_path):
        # @ninjin: not sure what 'version' was supposed to be returned
        #          here, but none was defined, so returning that
        # raise NoSVGError(version)
        raise NoSVGError('None')

    filename = document + '.' + suffix

    # sorry, quick hack to get the content-type right
    # TODO: send this with initial 'stored' response instead of
    # guessing on suffix
    if suffix == SVG_SUFFIX:
        content_type = 'image/svg+xml'
    elif suffix == PNG_SUFFIX:
        content_type = 'image/png'
    elif suffix == PDF_SUFFIX:
        content_type = 'application/pdf'
    elif suffix == EPS_SUFFIX:
        content_type = 'application/postscript'
    else:
        Messager.error('Unknown suffix "%s"; cannot determine Content-Type' % suffix)
        # TODO: reasonable backoff value
        content_type = None

    # Bail out with a hack since we violated the protocol
    hdrs = [('Content-Type', content_type),
            ('Content-Disposition', 'inline; filename=' + filename)]
    with open(stored_path, 'rb') as stored_file:
        data = stored_file.read()
    raise NoPrintJSONError(hdrs, data)
def ann_logger():
    """
    Lazy initializer for the annotation logger. Returns None if
    annotation logging is not configured and a logger otherwise.
    """
    if ann_logger.__logger == False:  # not initialized
        if ANNOTATION_LOG is None:  # not configured
            ann_logger.__logger = None
        else:  # initialize
            try:
                l = logging.getLogger('annotation')
                l.setLevel(logging.INFO)
                handler = logging.FileHandler(ANNOTATION_LOG)
                handler.setLevel(logging.INFO)
                formatter = logging.Formatter('%(asctime)s\t%(message)s')
                handler.setFormatter(formatter)
                l.addHandler(handler)
                ann_logger.__logger = l
            except IOError, e:
                Messager.error("""Error: failed to initialize annotation log %s: %s.
Edit action not logged.
Please check ANNOTATION_LOG setting in config.py""" % (ANNOTATION_LOG, e))
                logging.error("Failed to initialize annotation log %s: %s" %
                              (ANNOTATION_LOG, e))
                ann_logger.__logger = None
    return ann_logger.__logger
def _parse_relation_annotation(self, id, data, data_tail, input_file_path):
    try:
        type_delim = data.index(' ')
        type, type_tail = (data[:type_delim], data[type_delim:])
    except ValueError:
        # cannot have a relation with just a type (contra event)
        raise IdedAnnotationLineSyntaxError(id, self.ann_line, self.ann_line_num + 1, input_file_path)

    try:
        args = [tuple(arg.split(':')) for arg in type_tail.split()]
    except ValueError:
        raise IdedAnnotationLineSyntaxError(id, self.ann_line, self.ann_line_num + 1, input_file_path)

    if len(args) != 2:
        Messager.error('Error parsing relation: must have exactly two arguments')
        raise IdedAnnotationLineSyntaxError(id, self.ann_line, self.ann_line_num + 1, input_file_path)

    args.sort()
    if args[0][0] == args[1][0]:
        Messager.error('Error parsing relation: arguments must not be identical')
        raise IdedAnnotationLineSyntaxError(id, self.ann_line, self.ann_line_num + 1, input_file_path)

    return BinaryRelationAnnotation(id, type,
                                    args[0][0], args[0][1],
                                    args[1][0], args[1][1],
                                    data_tail, source_id=input_file_path)
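# For reference, the tab-separated standoff shape this parser consumes: the
# data part of a line such as
#
#     R1\tOrigin Arg1:T3 Arg2:T4
#
# yields type 'Origin' and (after the sort above) the canonicalized argument
# pairs [('Arg1', 'T3'), ('Arg2', 'T4')].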
def allowed_to_read(real_path):
    data_path = path_join('/', relpath(real_path, DATA_DIR))
    # add trailing slash to directories, required to comply to robots.txt
    if isdir(real_path):
        data_path = '%s/' % (data_path)

    real_dir = dirname(real_path)
    robotparser = ProjectConfiguration(real_dir).get_access_control()
    if robotparser is None:
        return True  # default allow

    # directory read permission
    try:
        user = get_session().get('user')
        if user is None:
            Messager.error('Not logged in!', duration=3)
            user = '******'
    except KeyError:
        Messager.error('Not logged in!', duration=3)
        return False

    # print(user, file=sys.stderr)
    # display_message('Path: %s, dir: %s, user: %s, ' % (data_path, real_dir, user), type='error', duration=-1)
    # example data_path values:
    #     /tutorials/
    #     /tutorials/bio/
    #     /tutorials/news/
    # print(data_path, file=sys.stderr)
    return robotparser.can_fetch(user, data_path)
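# The access control above reuses robots.txt semantics via robotparser, with
# the user name standing in for the user-agent. An illustrative (assumed, not
# taken from the source) access-control file:
#
#     User-agent: guest
#     Disallow: /private/
#
#     User-agent: *
#     Disallow: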
def _create_relation(ann_obj, projectconf, mods, origin, target, type, attributes, old_type, old_target, undo_resp={}): attributes = _parse_attributes(attributes) if old_type is not None or old_target is not None: assert type in projectconf.get_relation_types(), ( ('attempting to convert relation to non-relation "%s" ' % (target.type, )) + ('(legit types: %s)' % (unicode(projectconf.get_relation_types()), ))) sought_target = (old_target if old_target is not None else target.id) sought_type = (old_type if old_type is not None else type) sought_origin = origin.id # We are to change the type, target, and/or attributes found = None for ann in ann_obj.get_relations(): if (ann.arg1 == sought_origin and ann.arg2 == sought_target and ann.type == sought_type): found = ann break if found is None: # TODO: better response Messager.error('_create_relation: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target))) elif found.arg2 == target.id and found.type == type: # no changes to type or target pass else: # type and/or target changed, mark. before = unicode(found) found.arg2 = target.id found.type = type mods.change(before, found) target_ann = found else: # Create a new annotation new_id = ann_obj.get_new_id('R') # TODO: do we need to support different relation arg labels # depending on participant types? This doesn't. rels = projectconf.get_relations_by_type(type) rel = rels[0] if rels else None assert rel is not None and len(rel.arg_list) == 2 a1l, a2l = rel.arg_list ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id, a2l, target.id, '\t') mods.addition(ann) ann_obj.add_annotation(ann) target_ann = ann # process attributes if target_ann is not None: _set_attributes(ann_obj, ann, attributes, mods, undo_resp) elif attributes != None: Messager.error('_create_relation: cannot set arguments: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target))) return target_ann
def filter_folia(ann_obj):
    forbidden_ann = []
    response = {"entities": [], "comments": [], "relations": [],
                "attributes": [], "tokens": {}}
    try:
        import simplejson as json
        import session
        string = session.load_conf()["config"]
        val = json.loads(string)["foliaLayers"]
    except session.NoSessionError:
        val = []
    except KeyError:
        val = []
    except Exception as e:
        val = []
        Messager.error("Error while enabling/disabling folia layers: " + str(e))

    try:
        response["tokens"] = ann_obj.folia["tokens"]
    except KeyError:
        pass

    if val:
        removed = set()
        forbidden = set(i for i in val)
        result = []
        alternatives = "alter" in val
        try:
            if 'all' in val:
                response["tokens"] = {}
                return response
            else:
                for i in ann_obj.folia["entities"]:
                    if not i[3] in forbidden and not (i[4] and alternatives):
                        result.append(i)
                    else:
                        removed.add(i[0])
                response["entities"] = result
                result = []
                for i in ann_obj.folia["relations"]:
                    if (not i[3] in forbidden and not i[2][0][1] in removed and
                            not i[2][1][1] in removed and
                            not (i[4] and alternatives)):
                        result.append(i)
                    else:
                        removed.add(i[0])
                response["relations"] = result
                result = []
                for i in ann_obj.folia["attributes"]:
                    if not i[2] in removed:
                        result.append(i)
                response["attributes"] = result
                result = []
                for i in ann_obj.folia["comments"]:
                    if not i[0] in removed:
                        result.append(i)
                response["comments"] = result
        except KeyError:
            pass
    else:
        response = ann_obj.folia
    return response
def _safe_serve(params, client_ip, client_hostname, cookie_data): # Note: Only logging imports here from config import WORK_DIR from logging import basicConfig as log_basic_config # Enable logging try: from config import LOG_LEVEL log_level = _convert_log_level(LOG_LEVEL) except ImportError: from logging import WARNING as LOG_LEVEL_WARNING log_level = LOG_LEVEL_WARNING log_basic_config(filename=path_join(WORK_DIR, "server.log"), level=log_level) # Do the necessary imports after enabling the logging, order critical try: from common import ProtocolError, ProtocolArgumentError, NoPrintJSONError from dispatch import dispatch from jsonwrap import dumps from message import Messager from session import get_session, init_session, close_session, NoSessionError, SessionStoreError except ImportError: # Note: Heisenbug trap for #612, remove after resolved from logging import critical as log_critical from sys import path as sys_path log_critical("Heisenbug trap reports: " + str(sys_path)) raise init_session(client_ip, cookie_data=cookie_data) response_is_JSON = True try: # Unpack the arguments into something less obscure than the # Python FieldStorage object (part dictonary, part list, part FUBAR) http_args = DefaultNoneDict() for k in params: # Also take the opportunity to convert Strings into Unicode, # according to HTTP they should be UTF-8 try: http_args[k] = unicode(params.getvalue(k), encoding="utf-8") except TypeError: Messager.error( "protocol argument error: expected string argument %s, got %s" % (k, type(params.getvalue(k))) ) raise ProtocolArgumentError # Dispatch the request json_dic = dispatch(http_args, client_ip, client_hostname) except ProtocolError, e: # Internal error, only reported to client not to log json_dic = {} e.json(json_dic) # Add a human-readable version of the error err_str = str(e) if err_str != "": Messager.error(err_str, duration=-1)
def __read_term_hierarchy(input):
    root_nodes = []
    last_node_at_depth = {}

    macros = {}
    for l in input:
        # skip empties and lines starting with '#'
        if l.strip() == '' or re.match(r'^\s*#', l):
            continue

        # interpret lines of only hyphens as separators
        # for display
        if re.match(r'^\s*-+\s*$', l):
            # TODO: proper placeholder and placing
            root_nodes.append(SEPARATOR_STR)
            continue

        # interpret lines of the format <STR1>=STR2 as "macro"
        # definitions, defining <STR1> as a placeholder that should be
        # replaced with STR2 wherever it occurs.
        m = re.match(r'^<([a-zA-Z_-]+)>=\s*(.*?)\s*$', l)
        if m:
            name, value = m.groups()
            if name in reserved_macro_name:
                Messager.error("Cannot redefine <%s> in configuration, it is a reserved name." % name)
                # TODO: proper exception
                assert False
            else:
                macros["<%s>" % name] = value
            continue

        # macro expansion
        for n in macros:
            l = l.replace(n, macros[n])

        m = re.match(r'^(\s*)([^\t]+)(?:\t(.*))?$', l)
        assert m, "Error parsing line: '%s'" % l
        indent, terms, args = m.groups()
        terms = [t.strip() for t in terms.split("|") if t.strip() != ""]
        if args is None or args.strip() == "":
            args = []
        else:
            args = [a.strip() for a in args.split(",") if a.strip() != ""]

        # depth in the ontology corresponds to the number of
        # spaces in the initial indent.
        depth = len(indent)

        n = TypeHierarchyNode(terms, args)
        if depth == 0:
            # root level, no children assignments
            root_nodes.append(n)
        else:
            # assign as child of last node at the depth of the parent
            assert depth - 1 in last_node_at_depth, (
                "Error: no parent for '%s'" % l)
            last_node_at_depth[depth - 1].children.append(n)
        last_node_at_depth[depth] = n

    return root_nodes
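# A small input sketch in the format parsed above: depth by leading spaces,
# '|' separating display-term alternatives, a tab before comma-separated
# args, '<NAME>=value' macros, and hyphen-only separator lines (the type
# names here are illustrative):
#
#     <ENTITY>=Entity
#     Person|Individual
#      Employee
#     -----
#     <ENTITY>-derived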
def _listdir(directory):
    # return listdir(directory)
    try:
        assert_allowed_to_read(directory)
        return [f for f in listdir(directory)
                if not _is_hidden(f) and allowed_to_read(path_join(directory, f))]
    except OSError, e:
        Messager.error("Error listing %s: %s" % (directory, e))
        raise AnnotationCollectionNotFoundError(directory)
def whoami():
    json_dic = {}
    try:
        json_dic['user'] = get_session().get('user')
    except KeyError:
        # TODO: Really send this message?
        Messager.error('Not logged in!', duration=3)
    return json_dic
def whoami(): json_dic = {} try: json_dic["user"] = get_session().get("user") except KeyError: # TODO: Really send this message? Messager.error("Not logged in!", duration=3) return json_dic
def getAnnObject2(collection, document):
    '''newest version of the getAnnObject method'''
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    ann = None
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation:
    try:
        import os
        import simplejson as json
        import session
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                from verify_annotations import verify_annotation
                projectconf = ProjectConfiguration(docdir)
                issues = []
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None): if raw_text is not None: # looks like somebody read this already; nice text = raw_text else: # need to read raw text try: with open_textfile(txt_file_path, "r") as txt_file: text = txt_file.read() except IOError: raise UnableToReadTextFile(txt_file_path) except UnicodeDecodeError: Messager.error("Error reading text file: nonstandard encoding or binary?", -1) raise UnableToReadTextFile(txt_file_path) j_dic["text"] = text from logging import info as log_info tokeniser = options_get_tokenization(dirname(txt_file_path)) # First, generate tokenisation if tokeniser == "mecab": from tokenise import jp_token_boundary_gen tok_offset_gen = jp_token_boundary_gen elif tokeniser == "whitespace": from tokenise import whitespace_token_boundary_gen tok_offset_gen = whitespace_token_boundary_gen elif tokeniser == "ptblike": from tokenise import gtb_token_boundary_gen tok_offset_gen = gtb_token_boundary_gen else: Messager.warning("Unrecognized tokenisation option " ", reverting to whitespace tokenisation.") from tokenise import whitespace_token_boundary_gen tok_offset_gen = whitespace_token_boundary_gen j_dic["token_offsets"] = [o for o in tok_offset_gen(text)] ssplitter = options_get_ssplitter(dirname(txt_file_path)) if ssplitter == "newline": from ssplit import newline_sentence_boundary_gen ss_offset_gen = newline_sentence_boundary_gen elif ssplitter == "regex": from ssplit import regex_sentence_boundary_gen ss_offset_gen = regex_sentence_boundary_gen else: Messager.warning("Unrecognized sentence splitting option " ", reverting to newline sentence splitting.") from ssplit import newline_sentence_boundary_gen ss_offset_gen = newline_sentence_boundary_gen j_dic["sentence_offsets"] = [o for o in ss_offset_gen(text)] return True
def __set_db_measure(db, measure): try: import simstring except ImportError: Messager.error(SIMSTRING_MISSING_ERROR, duration=-1) raise NoSimStringError ss_measure_by_str = {"cosine": simstring.cosine, "overlap": simstring.overlap} db.measure = ss_measure_by_str[measure]
def __import_simstring():
    try:
        import simstring
    except ImportError:
        Messager.error('''Error: failed to import the simstring library.
This library is required for approximate string matching DB lookup.
Please install simstring and its Python bindings from
http://www.chokkan.org/software/simstring/''', duration=-1)
        raise NoSimStringError
def _safe_serve(params, client_ip, client_hostname, cookie_data):
    # Note: Only logging imports here
    from config import WORK_DIR
    from logging import basicConfig as log_basic_config

    # Enable logging
    try:
        from config import LOG_LEVEL
        log_level = _convert_log_level(LOG_LEVEL)
    except ImportError:
        from logging import WARNING as LOG_LEVEL_WARNING
        log_level = LOG_LEVEL_WARNING
    log_basic_config(filename=path_join(WORK_DIR, 'server.log'),
                     level=log_level)

    # Do the necessary imports after enabling the logging, order critical
    try:
        from common import ProtocolError, ProtocolArgumentError, NoPrintJSONError
        from dispatch import dispatch
        from jsonwrap import dumps
        from message import Messager
        from session import get_session, init_session, close_session, NoSessionError, SessionStoreError
    except ImportError:
        # Note: Heisenbug trap for #612, remove after resolved
        from logging import critical as log_critical
        from sys import path as sys_path
        log_critical('Heisenbug trap reports: ' + str(sys_path))
        raise

    init_session(client_ip, cookie_data=cookie_data)
    response_is_JSON = True
    try:
        # Unpack the arguments into something less obscure than the
        # Python FieldStorage object (part dictionary, part list, part FUBAR)
        http_args = DefaultNoneDict()
        for k in params:
            # Also take the opportunity to convert Strings into Unicode,
            # according to HTTP they should be UTF-8
            try:
                http_args[k] = unicode(params.getvalue(k), encoding='utf-8')
            except TypeError:
                Messager.error(
                    'protocol argument error: expected string argument %s, got %s'
                    % (k, type(params.getvalue(k))))
                raise ProtocolArgumentError

        # Dispatch the request
        json_dic = dispatch(http_args, client_ip, client_hostname)
    except ProtocolError, e:
        # Internal error, only reported to client not to log
        json_dic = {}
        e.json(json_dic)

        # Add a human-readable version of the error
        err_str = str(e)
        if err_str != '':
            Messager.error(err_str, duration=-1)
def _listdir(directory):
    # return listdir(directory)
    try:
        assert_allowed_to_read(directory)
        return [f for f in listdir(directory)
                if not _is_hidden(f) and allowed_to_read(path_join(directory, f))]
    except OSError as e:
        Messager.error("Error listing %s: %s" % (directory, e))
        raise AnnotationCollectionNotFoundError(directory)
def __init__(self, document, read_only=False):
    # TODO: DOC!
    # TODO: Incorporate file locking! Is the destructor called upon
    #       interpreter crash?
    from collections import defaultdict
    from os.path import basename, getmtime, getctime
    # from fileinput import FileInput, hook_encoded

    # we should remember this
    self._document = document

    self.failed_lines = []

    ### Here be dragons, these objects need constant updating and syncing
    # Annotation for each line of the file
    self._lines = []
    # Mapping between annotation objects and which line they occur on
    # Range: [0, inf.) unlike [1, inf.) which is common for files
    self._line_by_ann = {}
    # Maximum id number used for each id prefix, to speed up id generation
    # XXX: This is effectively broken by the introduction of id suffixes
    self._max_id_num_by_prefix = defaultdict(lambda: 1)
    # Annotation by id, not including non-ided annotations
    self._ann_by_id = {}
    ###

    ## We use some heuristics to find the appropriate annotation files
    self._read_only = read_only
    input_files = self._select_input_files(document)

    if not input_files:
        raise AnnotationFileNotFoundError(document)

    # We then try to open the files we got using the heuristics
    # self._file_input = FileInput(openhook=hook_encoded('utf-8'))
    self._input_files = input_files

    # Finally, parse the given annotation file
    try:
        self._parse_ann_file()

        # Sanity checking that can only be done post-parse
        self._sanity()
    except UnicodeDecodeError:
        Messager.error('Encoding error reading annotation file: '
                       'nonstandard encoding or binary?', -1)
        # TODO: more specific exception
        raise AnnotationFileNotFoundError(document)

    # XXX: Hack to get the timestamps after parsing
    if (len(self._input_files) == 1 and
            self._input_files[0].endswith(JOINED_ANN_FILE_SUFF)):
        self.ann_mtime = getmtime(self._input_files[0])
        self.ann_ctime = getctime(self._input_files[0])
    else:
        # We don't have a single file, just set to epoch for now
        self.ann_mtime = 0
        self.ann_ctime = 0
def _text_for_offsets(text, offsets):
    """
    Given a text and a list of (start, end) integer offsets, returns
    the (catenated) text corresponding to those offsets.
    """
    try:
        return "".join([text[s:e] for s, e in offsets])
    except Exception:
        Messager.error('_text_for_offsets: failed to get text for given offsets (%s)' % str(offsets))
        raise ProtocolArgumentError
def update_dump(j_dic, file_path):
    app_path = WORK_DIR + "/application/"
    temp_paths = file_path.split("/data/")
    try:
        full_name = temp_paths[1].replace("/", "")
        temp = open(app_path + full_name, 'wb')
        pickle_dump(j_dic, temp)
        temp.close()
    except Exception as e:
        Messager.error("Error while caching changes in the annotation file: " + str(e))
def search_anns_for_event(ann_objs, trigger_text, args, restrict_types=[],
                          ignore_types=[]):
    """
    Searches the given Annotations objects for Event annotations
    matching the given specification. Returns a SearchMatchSet object.
    """
    # treat None and empty list uniformly
    restrict_types = [] if restrict_types is None else restrict_types
    ignore_types = [] if ignore_types is None else ignore_types

    # TODO: include args in description
    description = "Event triggered by text containing '%s'" % trigger_text
    if restrict_types != []:
        description = description + ' (of type %s)' % (",".join(restrict_types))
    matches = SearchMatchSet(description)

    for ann_obj in ann_objs:
        # collect per-document (ann_obj) for sorting
        ann_matches = []
        for e in ann_obj.get_events():
            if e.type in ignore_types:
                continue
            if restrict_types != [] and e.type not in restrict_types:
                continue
            try:
                t_ann = ann_obj.get_ann_by_id(e.trigger)
            except:
                # TODO: specific exception
                Messager.error('Failed to retrieve trigger annotation %s, skipping event %s in search' % (e.trigger, e.id))
                continue

            # TODO: make options for "text included" vs. "text matches"
            # TODO: remove temporary hack giving special status to "*"
            if (trigger_text != None and trigger_text != "" and
                    trigger_text != "*" and trigger_text not in t_ann.text):
                continue

            # TODO: argument constraints
            if len(args) != 0:
                Messager.warning('NOTE: ignoring event argument constraints in search (not implemented yet, sorry!)')

            ann_matches.append((t_ann, e))

        # sort by trigger start offset
        ann_matches.sort(lambda a, b: cmp((a[0].start, -a[0].end),
                                          (b[0].start, -b[0].end)))

        # add to overall collection
        for t_obj, e in ann_matches:
            matches.add_match(ann_obj, e)

    # sort by document name for output
    matches.sort_matches()

    return matches
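# Note: the cmp-style sort above is Python 2 idiom; an equivalent key-based
# sort (also valid on Python 3) would be:
#
#     ann_matches.sort(key=lambda m: (m[0].start, -m[0].end))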
def whoami():
    json_dic = {}
    try:
        if USER_PASSWORD != False:
            json_dic['user'] = get_session().get('user')
        else:
            json_dic['anonymous'] = True
    except KeyError:
        # TODO: Really send this message?
        Messager.error('Not logged in!', duration=3)
    return json_dic
def _text_for_offsets(text, offsets): """Given a text and a list of (start, end) integer offsets, returns the (catenated) text corresponding to those offsets, joined appropriately for use in a TextBoundAnnotation(WithText).""" try: return DISCONT_SEP.join(text[s:e] for s, e in offsets) except Exception: Messager.error( '_text_for_offsets: failed to get text for given offsets (%s)' % str(offsets)) raise ProtocolArgumentError
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    from logging import info as log_info

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option "%s", '
                         'reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option "%s", '
                         'reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
def _create_relation(ann_obj, projectconf, mods, origin, target, type, attributes, old_type, old_target, undo_resp={}): attributes = _parse_attributes(attributes) if old_type is not None or old_target is not None: assert type in projectconf.get_relation_types(), ( ('attempting to convert relation to non-relation "%s" ' % (target.type, )) + ('(legit types: %s)' % (unicode(projectconf.get_relation_types()), ))) sought_target = (old_target if old_target is not None else target.id) sought_type = (old_type if old_type is not None else type) # We are to change the type, target, and/or attributes found = None for ann in ann_obj.get_relations(): if ann.arg2 == sought_target and ann.type == sought_type: found = ann break if found is None: # TODO: better response Messager.error('_create_relation: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target))) elif found.arg2 == target.id and found.type != type: # no changes to type or target pass else: # type and/or target changed, mark. before = unicode(found) found.arg2 = target.id found.type = type mods.change(before, found) target_ann = found else: # Create a new annotation new_id = ann_obj.get_new_id('R') rel = projectconf.get_relation_by_type(type) assert rel is not None and len(rel.arg_list) == 2 a1l, a2l = rel.arg_list ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id, a2l, target.id, '\t') mods.addition(ann) ann_obj.add_annotation(ann) target_ann = ann # process attributes if target_ann is not None: _set_attributes(ann_obj, ann, attributes, mods, undo_resp) elif attributes != None: Messager.error('_create_relation: cannot set arguments: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target))) return target_ann
def update_dump(j_dic, file_path):
    app_path = WORK_DIR + "/application/"
    temp_paths = file_path.split("/data/")
    try:
        full_name = temp_paths[1].replace("/", "")
        temp = open(app_path + full_name, 'wb')
        pickle_dump(j_dic, temp)
        temp.close()
    except Exception as e:
        Messager.error("Error while caching changes in the annotation file: " + str(e))
def _text_for_offsets(text, offsets): """ Given a text and a list of (start, end) integer offsets, returns the (catenated) text corresponding to those offsets, joined appropriately for use in a TextBoundAnnotation(WithText). """ try: return DISCONT_SEP.join(text[s:e] for s,e in offsets) except Exception: Messager.error('_text_for_offsets: failed to get text for given offsets (%s)' % str(offsets)) raise ProtocolArgumentError
def _read_document_text(self, document):
    # TODO: this is too naive; document may be e.g. "PMID.a1",
    # in which case the reasonable text file name guess is
    # "PMID.txt", not "PMID.a1.txt"
    textfn = document + "." + TEXT_FILE_SUFFIX
    try:
        with open_textfile(textfn, 'r') as f:
            text = f.read()
            return text
    except:
        Messager.error('Error reading document text from %s' % textfn)
    return None
def jp_token_boundary_gen(text):
    try:
        from mecab import token_offsets_gen
        for o in token_offsets_gen(text):
            yield o
    except ImportError:
        from message import Messager
        Messager.error('Failed to import MeCab, '
                       'falling back on whitespace tokenization. '
                       'Please check configuration and/or server setup.')
        for o in whitespace_token_boundary_gen(text):
            yield o
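# A minimal sketch of the whitespace fallback assumed above (the real
# tokenise.whitespace_token_boundary_gen may differ in detail):
import re

def _demo_whitespace_token_boundary_gen(text):
    # yield (start, end) offsets for maximal runs of non-whitespace
    for m in re.finditer(r'\S+', text):
        yield (m.start(), m.end())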
def tag_file(directory, document):
    import os
    textfn = os.path.join(DATA_DIR, directory, document + '.txt')
    tagger_root = os.path.join(BASE_DIR, '../nlpwrap')
    tagger_cmd = os.path.join(tagger_root, 'tag-NERsuite.sh') + " " + textfn

    try:
        os.system(tagger_cmd)
    except Exception, e:
        Messager.error("Failed to run tagger. Please contact the administrator(s).",
                       duration=-1)
        from sys import stderr
        print >> stderr, e

    return
def __set_db_measure(db, measure):
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    ss_measure_by_str = {
        'cosine': simstring.cosine,
        'overlap': simstring.overlap,
    }
    db.measure = ss_measure_by_str[measure]
def _json_offsets_to_list(offsets): try: offsets = json_loads(offsets) except Exception: Messager.error('create_span: protocol argument error: expected offsets as JSON, but failed to parse "%s"' % str(offsets)) raise ProtocolArgumentError try: offsets = [(int(s),int(e)) for s,e in offsets] except Exception: Messager.error('create_span: protocol argument error: expected offsets as list of int pairs, received "%s"' % str(offsets)) raise ProtocolArgumentError return offsets
def ssdb_supstring_lookup(s, dbname, threshold=DEFAULT_THRESHOLD,
                          with_score=False):
    '''
    Given a string s and a DB name, returns the strings in the
    associated simstring DB that likely contain s as an (approximate)
    substring. If with_score is True, returns pairs of (str,score)
    where score is the fraction of n-grams in s that are also found in
    the matched string.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    db = ssdb_open(dbname.encode('UTF-8'))

    __set_db_measure(db, 'overlap')
    db.threshold = threshold
    result = db.retrieve(s)
    db.close()

    # assume simstring DBs always contain UTF-8-encoded strings
    result = [r.decode('UTF-8') for r in result]
    s = s.decode('UTF-8')

    # The simstring overlap measure is symmetric and thus does not
    # differentiate between substring and superstring matches.
    # Replicate a small bit of the simstring functionality (mostly the
    # ngrams() function) to filter to substrings only.
    s_ngrams = ngrams(s)
    filtered = []
    for r in result:
        if s in r:
            # avoid calculation: simple containment => score=1
            if with_score:
                filtered.append((r, 1.0))
            else:
                filtered.append(r)
        else:
            r_ngrams = ngrams(r)
            overlap = s_ngrams & r_ngrams
            if len(overlap) >= len(s_ngrams) * threshold:
                if with_score:
                    filtered.append((r, 1.0 * len(overlap) / len(s_ngrams)))
                else:
                    filtered.append(r)

    return filtered
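# Self-contained illustration of the containment score computed above: the
# fraction of s's n-grams also found in the candidate (_demo_ngrams is a
# plain trigram stand-in for the module's ngrams() helper):
def _demo_ngrams(s, n=3):
    return set(s[i:i + n] for i in range(len(s) - n + 1))

def _demo_containment_score(s, candidate):
    s_ngrams = _demo_ngrams(s)
    overlap = s_ngrams & _demo_ngrams(candidate)
    return 1.0 * len(overlap) / len(s_ngrams)

# e.g. _demo_containment_score('kinase', 'protein kinase C') -> 1.0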
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        # log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        # log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
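# Effect of the double-space workaround above (u'\xa0' is the non-breaking
# space written as unichr(0x00A0) in the Python 2 code):
#
#     u'a  b'.replace(u'  ', u' \xa0')  # -> u'a \xa0b'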
def _delete_arc_with_ann(origin, target, type_, mods, ann_obj, projectconf):
    origin_ann = ann_obj.get_ann_by_id(origin)

    # specifics of delete determined by arc type (equiv relation,
    # other relation, event argument)
    if projectconf.is_relation_type(type_):
        if projectconf.is_equiv_type(type_):
            _delete_arc_equiv(origin, target, type_, mods, ann_obj)
        else:
            _delete_arc_nonequiv_rel(origin, target, type_, mods, ann_obj)
    elif projectconf.is_event_type(origin_ann.type):
        _delete_arc_event_arg(origin, target, type_, mods, ann_obj)
    else:
        Messager.error('Unknown annotation types for delete')
def getAnnObject(collection, document):
    try:
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if os.path.isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation:
    try:
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
def validate_annotation(ann, projectconf):
    '''
    Added by Sander Naert
    Checks an annotation file against user-defined rules.
    '''
    from message import Messager
    from validation_rule import ValidationRules
    issues = []
    try:
        vrules = ValidationRules(projectconf)
        issues = vrules.validate(ann)[0]
    except Exception as e:
        Messager.error("Error: validating annotations: " + str(e))
    return issues
def _permission_check():
    from os import access, R_OK, W_OK
    from config import DATA_DIR, WORK_DIR
    from jsonwrap import dumps
    from message import Messager

    if not access(WORK_DIR, R_OK | W_OK):
        Messager.error((('Work dir: "%s" is not read-able and ' % WORK_DIR) +
                        'write-able by the server'), duration=-1)
        raise PermissionError

    if not access(DATA_DIR, R_OK):
        Messager.error((('Data dir: "%s" is not read-able ' % DATA_DIR) +
                        'by the server'), duration=-1)
        raise PermissionError
def _json_offsets_to_list(offsets):
    try:
        offsets = json_loads(offsets)
    except Exception:
        Messager.error(
            'create_span: protocol argument error: expected offsets as JSON, but failed to parse "%s"'
            % str(offsets))
        raise ProtocolArgumentError
    try:
        offsets = [(int(s), int(e)) for s, e in offsets]
    except Exception:
        Messager.error(
            'create_span: protocol argument error: expected offsets as list of int pairs, received "%s"'
            % str(offsets))
        raise ProtocolArgumentError
    return offsets
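# Round-trip sketch for the offset unpacking above (the standard library's
# json.loads stands in for json_loads here):
#
#     import json
#     offsets = json.loads('[[0, 4], [5, 9]]')
#     [(int(s), int(e)) for s, e in offsets]  # -> [(0, 4), (5, 9)]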
def ssdb_open(dbname):
    '''
    Given a DB name, opens it as a simstring DB and returns the handle.
    The caller is responsible for invoking close() on the handle.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    try:
        return simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)