Example #1
def possible_arc_types(collection, origin_type, target_type):
    directory = collection

    real_dir = real_directory(directory)
    projectconf = ProjectConfiguration(real_dir)
    response = {}

    try:
        possible = projectconf.arc_types_from_to(origin_type, target_type)

        # TODO: proper error handling
        if possible is None:
            Messager.error('Error selecting arc types!', -1)
        elif possible == []:
            # nothing to select
            response['html'] = generate_empty_fieldset()
            response['keymap'] = {}
            response['empty'] = True
        else:
            # XXX TODO: intentionally breaking this; KB shortcuts
            # should no longer be sent here. Remove 'keymap' and
            # 'html' args once clientside generation done.
            arc_kb_shortcuts = {} #select_keyboard_shortcuts(possible)

            response['keymap'] = {}
            for k, p in arc_kb_shortcuts.items():
                response['keymap'][k] = "arc_"+p

            response['html'] = generate_arc_type_html(projectconf, possible, arc_kb_shortcuts)
    except Exception:
        Messager.error('Error selecting arc types!', -1)
        raise

    return response
Example #2
def _server_crash(cookie_hdrs, e):
    from config import ADMIN_CONTACT_EMAIL, DEBUG
    from jsonwrap import dumps
    from message import Messager
    from sys import stderr
    from time import time

    stack_trace = _get_stack_trace()

    if DEBUG:
        # Send back the stack-trace as json
        error_msg = '\n'.join(('Server Python crash, stack-trace is:\n',
                               stack_trace))
        Messager.error(error_msg, duration=-1)
    else:
        # Give the user an error message
        # Use the current time since epoch as an id for later log look-up
        error_msg = ('The server encountered a serious error, '
                     'please contact the administrators at %s '
                     'and give the id #%d'
                     ) % (ADMIN_CONTACT_EMAIL, int(time()))
        Messager.error(error_msg, duration=-1)

    # Print to stderr so that the exception is logged by the webserver
    print(stack_trace, file=stderr)

    json_dic = {
        'exception': 'serverCrash',
    }
    return (cookie_hdrs, ((JSON_HDR, ), dumps(Messager.output_json(json_dic))))
Example #3
def _server_crash(cookie_hdrs, e):
    import sys
    from time import time

    from config import ADMIN_CONTACT_EMAIL, DEBUG
    from jsonwrap import dumps
    from message import Messager

    stack_trace = _get_stack_trace()

    if DEBUG:
        # Send back the stack-trace as json
        error_msg = '\n'.join(('Server Python crash, stack-trace is:\n',
                               stack_trace))
        Messager.error(error_msg, duration=-1)
    else:
        # Give the user an error message
        # Use the current time since epoch as an id for later log look-up
        error_msg = ('The server encountered a serious error, '
                     'please contact the administrators at %s '
                     'and give the id #%d'
                     ) % (ADMIN_CONTACT_EMAIL, int(time()))
        Messager.error(error_msg, duration=-1)

    # Print to stderr so that the exception is logged by the webserver
    print(stack_trace, file=sys.stderr)

    json_dic = {
        'exception': 'serverCrash',
    }
    return (cookie_hdrs, ((JSON_HDR, ), dumps(Messager.output_json(json_dic))))
Example #4
def ann_logger(directory):
    """
    Lazy initializer for the annotation logger. Returns None if
    annotation logging is not configured for the given directory and a
    logger otherwise.
    """
    if ann_logger.__logger == False:
        # not initialized
        annlogfile = options_get_annlogfile(directory)
        if annlogfile == '<NONE>':
            # not configured
            ann_logger.__logger = None
        else:
            # initialize
            try:
                l = logging.getLogger('annotation')
                l.setLevel(logging.INFO)
                handler = logging.FileHandler(annlogfile)
                handler.setLevel(logging.INFO)
                formatter = logging.Formatter('%(asctime)s\t%(message)s')
                handler.setFormatter(formatter)
                l.addHandler(handler)
                ann_logger.__logger = l
            except IOError as e:
                Messager.error("""Error: failed to initialize annotation log %s: %s.
Edit action not logged.
Please check the Annotation-log logfile setting in tools.conf""" % (annlogfile, e))
                logging.error("Failed to initialize annotation log %s: %s" %
                              (annlogfile, e))
                ann_logger.__logger = None

    return ann_logger.__logger
Example #5
def ann_logger(directory):
    """
    Lazy initializer for the annotation logger. Returns None if
    annotation logging is not configured for the given directory and a
    logger otherwise.
    """
    if ann_logger.__logger == False:
        # not initialized
        annlogfile = options_get_annlogfile(directory)
        if annlogfile == '<NONE>':
            # not configured
            ann_logger.__logger = None
        else:
            # initialize
            try:
                l = logging.getLogger('annotation')
                l.setLevel(logging.INFO)
                handler = logging.FileHandler(annlogfile)
                handler.setLevel(logging.INFO)
                formatter = logging.Formatter('%(asctime)s\t%(message)s')
                handler.setFormatter(formatter)
                l.addHandler(handler)
                ann_logger.__logger = l
            except IOError as e:
                Messager.error(
                    """Error: failed to initialize annotation log %s: %s.
Edit action not logged.
Please check the Annotation-log logfile setting in tools.conf""" %
                    (annlogfile, e))
                logging.error("Failed to initialize annotation log %s: %s" %
                              (annlogfile, e))
                ann_logger.__logger = None

    return ann_logger.__logger
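
The ann_logger variants cache their state on the function object itself, so the attribute has to be primed at module level before the first call; None afterwards means "logging is not configured". A minimal sketch of that setup (the sentinel value is inferred from the checks above, it is not shown in the snippets):

# Prime the cache: False = not yet initialized, None = checked, not configured.
ann_logger.__logger = False

# Hypothetical usage:
# l = ann_logger(directory)
# if l is not None:
#     l.info('edit\tT1')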
Example #6
def _config_check():
    from message import Messager

    from sys import path
    from copy import deepcopy
    from os.path import abspath, dirname, join as path_join
    # Reset the path to force config.py to be in the root (could be hacked
    #       using __init__.py, but we can be monkey-patched anyway)
    orig_path = deepcopy(path)
    # Can't you empty in O(1) instead of O(N)?
    while path:
        path.pop()
    path.append(path_join(abspath(dirname(__file__)), '../..'))
    # Check if we have a config, otherwise whine
    try:
        import config
        del config
    except ImportError as e:
        path.extend(orig_path)
        # "Prettiest" way to check specific failure
        if e.name == 'config':
            Messager.error(_miss_config_msg(), duration=-1)
        else:
            Messager.error(_get_stack_trace(), duration=-1)
        raise ConfigurationError
Example #7
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH, include_marks=DEFAULT_INCLUDE_MARKS):
    """
    Given a list of strings, a DB name, and simstring options, builds
    a simstring DB for the strings.
    """
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"
        db = simstring.writer(dbfn)
        for s in strs:
            db.insert(s)
        db.close()
    except Exception:
        print("Error building simstring DB", file=sys.stderr)
        raise

    return dbfn
Example #8
def ssdb_supstring_exists(s, dbname, threshold=DEFAULT_THRESHOLD):
    """Given a string s and a DB name, returns whether at least one string in
    the associated simstring DB likely contains s as an (approximate)
    substring."""
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    if threshold == 1.0:
        # optimized (not hugely, though) for this common case
        db = ssdb_open(dbname.encode('UTF-8'))

        __set_db_measure(db, 'overlap')
        db.threshold = threshold

        result = db.retrieve(s)
        db.close()

        # assume simstring DBs always contain UTF-8 - encoded strings
        result = [r.decode('UTF-8') for r in result]

        for r in result:
            if s in r:
                return True
        return False
    else:
        # naive implementation for everything else
        return len(ssdb_supstring_lookup(s, dbname, threshold)) != 0
Example #9
    def set_Ann_state(self, directory, file, state):
        real_dir = real_directory(directory)
        assert_allowed_to_read(real_dir)
        # check and update
        try:
            cursor = self.conn.cursor()
            cursor.execute("""BEGIN TRANSACTION""")
            cursor.execute(
                """SELECT userName FROM Ann WHERE fileDirAbs = ? and fileName = ?;""",
                (directory, file))
            rows = cursor.fetchall()
            if len(rows) == 0:
                cursor.execute(
                    """UPDATE Ann SET state = ? WHERE fileDirAbs = ? and fileName = ?;""",
                    (state, directory, file))
            # commit here rather than in the finally clause: a COMMIT after
            # a rollback has ended the transaction would itself fail
            cursor.execute("COMMIT")
        except sqlite3.Error as e:
            # print("Database error: %s" % e, file=sys.stderr)
            Messager.error("Database error: %s" % e)
            self.conn.rollback()
        except Exception as e:
            # print("Exception in _query: %s" % e, file=sys.stderr)
            Messager.error("Exception in _query: %s" % e)
            self.conn.rollback()
        finally:
            cursor.close()
Example #10
def reverse_arc(collection, document, origin, target, type, attributes=None):
    directory = collection
    #undo_resp = {} # TODO
    real_dir = real_directory(directory)
    #mods = ModificationTracker() # TODO
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only 
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse
            found = None
            # TODO: more sensible lookup
            for ann in ann_obj.get_relations():
                if (ann.arg1 == origin and ann.arg2 == target and
                    ann.type == type):
                    found = ann
                    break
            if found is None:
                Messager.error('reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' % (str(origin), str(target), str(type)))
            else:
                # found it; just adjust this
                found.arg1, found.arg2 = found.arg2, found.arg1
                # TODO: modification tracker

        json_response = {}
        json_response['annotations'] = _json_from_ann(ann_obj)
        return json_response
Example #11
    def __init__(self):
        # Connect to the SQLite database. The database file is DB_FNAME;
        # if it does not exist, it is created automatically in the current directory.
        flag_exist = os.path.isfile(DB_FNAME)
        self.conn = sqlite3.connect(DB_FNAME)

        if flag_exist:
            return None
        try:
            cursor = self.conn.cursor()
            cursor.execute(_CREATE_ANN_SQL)
            self.conn.commit()
        except sqlite3.Error as e:
            # print("Database error: %s" % e, file=sys.stderr)
            Messager.error("Database error: %s" % e)
            self.conn.rollback()
            self.conn.close()
        except Exception as e:
            # print("Exception in _query: %s" % e, file=sys.stderr)
            Messager.error("Exception in _query: %s" % e)
            self.conn.rollback()
            self.conn.close()
        finally:
            cursor.close()
            en_import_DATA = True
            if en_import_DATA:
                for dir in [
                        x[0].replace(DATA_DIR, '') + '/'
                        for x in os.walk(DATA_DIR)
                ]:
                    if len(dir) > 1:
                        self.import_files(dir)
            return None
Example #12
    def import_files(self, directory):
        real_dir = real_directory(directory)
        assert_allowed_to_read(real_dir)
        # Get the document names
        file_names = [
            fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')
        ]
        try:
            cursor = self.conn.cursor()
            for filename in file_names:
                state, fid, fileName, fileDirAbs, uid, userName = Ann_NULL, 0, filename, directory, 0, None
                cursor.execute(
                    _INSERT_ANN_SQL,
                    (state, fid, fileName, fileDirAbs, uid, userName))
            self.conn.commit()
        except sqlite3.Error as e:
            # print("Database error: %s" % e, file=sys.stderr)
            Messager.error("Database error: %s" % e)
            self.conn.rollback()
        except Exception as e:
            # print("Exception in _query: %s" % e, file=sys.stderr)
            Messager.error("Exception in _query: %s" % e)
            self.conn.rollback()
        finally:
            cursor.close()
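
The _INSERT_ANN_SQL statement above binds the tuple (state, fid, fileName, fileDirAbs, uid, userName), which suggests one column per element in the Ann table. A hypothetical sketch of the two SQL constants, inferred from the bound parameters rather than taken from the project:

_CREATE_ANN_SQL = """CREATE TABLE IF NOT EXISTS Ann (
    state      INTEGER,
    fid        INTEGER,
    fileName   TEXT,
    fileDirAbs TEXT,
    uid        INTEGER,
    userName   TEXT
);"""

_INSERT_ANN_SQL = """INSERT INTO Ann (state, fid, fileName, fileDirAbs, uid, userName)
                     VALUES (?, ?, ?, ?, ?, ?);"""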
Example #13
def ann_logger():
    """
    Lazy initializer for the annotation logger. Returns None if
    annotation logging is not configured and a logger otherwise.
    """
    if ann_logger.__logger == False:
        # not initialized
        if ANNOTATION_LOG is None:
            # not configured
            ann_logger.__logger = None
        else:
            # initialize
            try:
                l = logging.getLogger('annotation')
                l.setLevel(logging.INFO)
                handler = logging.FileHandler(ANNOTATION_LOG)
                handler.setLevel(logging.INFO)
                formatter = logging.Formatter('%(asctime)s\t%(message)s')
                handler.setFormatter(formatter)
                l.addHandler(handler)
                ann_logger.__logger = l
            except IOError as e:
                Messager.error(
                    """Error: failed to initialize annotation log %s: %s.
Edit action not logged.
Please check ANNOTATION_LOG setting in config.py""" % (ANNOTATION_LOG, e))
                logging.error("Failed to initialize annotation log %s: %s" %
                              (ANNOTATION_LOG, e))
                ann_logger.__logger = None

    return ann_logger.__logger
Example #14
def ssdb_build(strs,
               dbname,
               ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    '''
    Given a list of strings, a DB name, and simstring options, builds
    a simstring DB for the strings.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"
        db = simstring.writer(dbfn)
        for s in strs:
            db.insert(s)
        db.close()
    except Exception:
        print("Error building simstring DB", file=sys.stderr)
        raise

    return dbfn
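
A short usage sketch tying ssdb_build to the supstring helpers (the DB name is arbitrary; where __ssdb_path places the file on disk is project-specific):

# Build a simstring DB from a small gazetteer, then query it.
names = ['interleukin-2', 'interleukin-2 receptor', 'NF-kappa B']
ssdb_build(names, 'proteins')

# True if some DB entry likely contains the query as a substring.
if ssdb_supstring_exists('interleukin', 'proteins', threshold=1.0):
    print('hit')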
Example #15
def _get_match_regex(text, text_match="word", match_case=False,
                     whole_string=False):
    """
    Helper for the various search_anns_for_ functions.
    """
    if match_case:
        regex_flags = 0
    else:
        regex_flags = re.IGNORECASE

    if text is None:
        text = ''

    if text_match == "word":
        # full word match: require word boundaries or, optionally,
        # whole string boundaries
        if whole_string:
            return re.compile(r'^'+re.escape(text)+r'$', regex_flags)
        else:
            return re.compile(r'\b'+re.escape(text)+r'\b', regex_flags)
    elif text_match == "substring":
        # any substring match, as text (nonoverlapping matches)
        return re.compile(re.escape(text), regex_flags)
    elif text_match == "regex":
        try:
            return re.compile(text, regex_flags)
        except Exception:  # whatever (sre_constants.error, other?)
            Messager.warning('Given string "%s" is not a valid regular expression.' % text)
            return None
    else:
        Messager.error('Unrecognized search match specification "%s"' % text_match)
        return None    
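
For illustration, how the three text_match modes behave (a sketch; the behavior follows directly from the patterns compiled above):

r = _get_match_regex('cat', text_match='word')
assert r.search('the cat sat') and not r.search('concatenate')

r = _get_match_regex('cat', text_match='substring')
assert r.search('concatenate')

# An invalid regular expression yields None (after a warning) instead of raising.
assert _get_match_regex('*bad', text_match='regex') is None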
Example #16
def ssdb_supstring_exists(s, dbname, threshold=DEFAULT_THRESHOLD):
    '''
    Given a string s and a DB name, returns whether at least one
    string in the associated simstring DB likely contains s as an
    (approximate) substring.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    if threshold == 1.0:
        # optimized (not hugely, though) for this common case
        db = ssdb_open(dbname.encode('UTF-8'))

        __set_db_measure(db, 'overlap')
        db.threshold = threshold

        result = db.retrieve(s)
        db.close()

        # assume simstring DBs always contain UTF-8 - encoded strings
        result = [r.decode('UTF-8') for r in result]
        s = s.decode('UTF-8')

        for r in result:
            if s in r:
                return True
        return False
    else:
        # naive implementation for everything else
        return len(ssdb_supstring_lookup(s, dbname, threshold)) != 0
Example #18
def retrieve_stored(document, suffix):
    stored_path = _stored_path()+'.'+suffix

    if not isfile(stored_path):
        # @ninjin: not sure what 'version' was supposed to be returned
        # here, but none was defined, so returning that
#         raise NoSVGError(version)
        raise NoSVGError('None')

    filename = document+'.'+suffix

    # sorry, quick hack to get the content-type right
    # TODO: send this with initial 'stored' response instead of
    # guessing on suffix
    if suffix == SVG_SUFFIX:
        content_type = 'image/svg+xml'
    elif suffix == PNG_SUFFIX:
        content_type = 'image/png'
    elif suffix == PDF_SUFFIX:
        content_type = 'application/pdf'
    elif suffix == EPS_SUFFIX:
        content_type = 'application/postscript'
    else:
        Messager.error('Unknown suffix "%s"; cannot determine Content-Type' % suffix)
        # TODO: reasonable backoff value
        content_type = None

    # Bail out with a hack since we violated the protocol
    hdrs = [('Content-Type', content_type),
            ('Content-Disposition', 'inline; filename=' + filename)]

    with open(stored_path, 'rb') as stored_file:
        data = stored_file.read()

    raise NoPrintJSONError(hdrs, data)
Example #19
def ann_logger():
    """
    Lazy initializer for the annotation logger. Returns None if
    annotation logging is not configured and a logger otherwise.
    """
    if ann_logger.__logger == False:
        # not initialized
        if ANNOTATION_LOG is None:
            # not configured
            ann_logger.__logger = None
        else:
            # initialize
            try:
                l = logging.getLogger('annotation')
                l.setLevel(logging.INFO)
                handler = logging.FileHandler(ANNOTATION_LOG)
                handler.setLevel(logging.INFO)
                formatter = logging.Formatter('%(asctime)s\t%(message)s')
                handler.setFormatter(formatter)
                l.addHandler(handler)
                ann_logger.__logger = l
            except IOError as e:
                Messager.error("""Error: failed to initialize annotation log %s: %s.
Edit action not logged.
Please check ANNOTATION_LOG setting in config.py""" % (ANNOTATION_LOG, e))
                logging.error("Failed to initialize annotation log %s: %s" %
                              (ANNOTATION_LOG, e))
                ann_logger.__logger = None

    return ann_logger.__logger
Example #20
    def _parse_relation_annotation(self, id, data, data_tail, input_file_path):
        try:
            type_delim = data.index(' ')
            type, type_tail = (data[:type_delim], data[type_delim:])
        except ValueError:
            # cannot have a relation with just a type (contra event)
            raise IdedAnnotationLineSyntaxError(id, self.ann_line, self.ann_line_num+1, input_file_path)
            
        try:
            args = [tuple(arg.split(':')) for arg in type_tail.split()]
        except ValueError:
            raise IdedAnnotationLineSyntaxError(id, self.ann_line, self.ann_line_num+1, input_file_path)

        if len(args) != 2:
            Messager.error('Error parsing relation: must have exactly two arguments')
            raise IdedAnnotationLineSyntaxError(id, self.ann_line, self.ann_line_num+1, input_file_path)

        args.sort()
        if args[0][0] == args[1][0]:
            Messager.error('Error parsing relation: arguments must not be identical')
            raise IdedAnnotationLineSyntaxError(id, self.ann_line, self.ann_line_num+1, input_file_path)

        return BinaryRelationAnnotation(id, type,
                                        args[0][0], args[0][1],
                                        args[1][0], args[1][1],
                                        data_tail, source_id=input_file_path)
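
For context, the input parsed here is a brat standoff relation line such as

R1	Origin Arg1:T3 Arg2:T4

where id is 'R1' and data is everything after the tab: the type ('Origin') runs up to the first space, and the remainder splits into exactly two (label, id) pairs, [('Arg1', 'T3'), ('Arg2', 'T4')] after the sort.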
Example #22
File: auth.py Project: WeSIG/Delta
def allowed_to_read(real_path):
    data_path = path_join('/', relpath(real_path, DATA_DIR))
    # add trailing slash to directories, required to comply to robots.txt
    if isdir(real_path):
        data_path = '%s/' % (data_path)

    real_dir = dirname(real_path)
    robotparser = ProjectConfiguration(real_dir).get_access_control()
    if robotparser is None:
        return True  # default allow

    # directory read permission
    try:
        user = get_session().get('user')
        if user is None:
            Messager.error('Not logged in!', duration=3)
            user = '******'
    except KeyError:
        Messager.error('Not logged in!', duration=3)
        return False

    # print(user, file=sys.stderr)
    # display_message('Path: %s, dir: %s, user: %s, ' % (data_path, real_dir, user), type='error', duration=-1)
    # / tutorials /
    # / tutorials /
    # / tutorials / bio /
    # / tutorials / news /
    # / tutorials /
    # / tutorials / bio /
    # / tutorials / news /
    # print(data_path, file=sys.stderr)

    return robotparser.can_fetch(user, data_path)
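
The access-control file returned by get_access_control() is parsed as a robots.txt-style document in which the "user agent" is the brat user name; that is why directories get a trailing slash above. A hypothetical configuration restricting a 'guest' user to the tutorials subtree (assuming the parser honors Allow lines, as Python's robotparser does):

User-agent: guest
Allow: /tutorials/
Disallow: /

User-agent: *
Disallow: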
Example #23
def _create_relation(ann_obj, projectconf, mods, origin, target, type,
                     attributes, old_type, old_target, undo_resp={}):
    attributes = _parse_attributes(attributes)

    if old_type is not None or old_target is not None:
        assert type in projectconf.get_relation_types(), (
                ('attempting to convert relation to non-relation "%s" ' % (target.type, )) +
                ('(legit types: %s)' % (str(projectconf.get_relation_types()), )))

        sought_target = (old_target
                if old_target is not None else target.id)
        sought_type = (old_type
                if old_type is not None else type)
        sought_origin = origin.id

        # We are to change the type, target, and/or attributes
        found = None
        for ann in ann_obj.get_relations():
            if (ann.arg1 == sought_origin and ann.arg2 == sought_target and 
                ann.type == sought_type):
                found = ann
                break

        if found is None:
            # TODO: better response
            Messager.error('_create_relation: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target)))
        elif found.arg2 == target.id and found.type == type:
            # no changes to type or target
            pass
        else:
            # type and/or target changed, mark.
            before = str(found)
            found.arg2 = target.id
            found.type = type
            mods.change(before, found)

        target_ann = found
    else:
        # Create a new annotation
        new_id = ann_obj.get_new_id('R')
        # TODO: do we need to support different relation arg labels
        # depending on participant types? This doesn't.         
        rels = projectconf.get_relations_by_type(type) 
        rel = rels[0] if rels else None
        assert rel is not None and len(rel.arg_list) == 2
        a1l, a2l = rel.arg_list
        ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id, a2l, target.id, '\t')
        mods.addition(ann)
        ann_obj.add_annotation(ann)

        target_ann = ann

    # process attributes
    if target_ann is not None:
        _set_attributes(ann_obj, target_ann, attributes, mods, undo_resp)
    elif attributes is not None:
        Messager.error('_create_relation: cannot set arguments: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target)))

    return target_ann
Example #24
def filter_folia(ann_obj):
    forbidden_ann = []
    response = {"entities": [], "comments": [], "relations": [],
                "attributes": [], "tokens": {}}
    try:
        import simplejson as json
        import session
        string = session.load_conf()["config"]
        val = json.loads(string)["foliaLayers"]
    except session.NoSessionError:
        val = []
    except KeyError:
        val = []
    except Exception as e:
        val = []
        Messager.error("Error while enabling/disabling folia layers: " + str(e))
    try:
        response["tokens"]=ann_obj.folia["tokens"]
    except KeyError as e:
        pass
    if val:
        removed = set()
        forbidden = set(val)
        result = []
        alternatives = "alter" in val
        try:
            if 'all' in val:
                response["tokens"]={}
                return response
            else:
                for i in ann_obj.folia["entities"]:
                    if not i[3] in forbidden and not ( i[4] and alternatives ):
                        result.append(i)
                    else:
                        removed.add(i[0])
                response["entities"] = result
                result = []
                for i in ann_obj.folia["relations"]:
                    if not i[3] in forbidden and not i[2][0][1] in removed and not i[2][1][1] in removed and not ( i[4] and alternatives ):
                        result.append(i)
                    else:
                        removed.add(i[0])
                response["relations"] = result
                result = []
                for i in ann_obj.folia["attributes"]:
                    if not i[2] in removed:
                        result.append(i)
                response["attributes"] = result
                result = []
                for i in ann_obj.folia["comments"]:
                    if not i[0] in removed:
                        result.append(i)
                response["comments"] = result
        except KeyError:
            pass
    else:
        response = ann_obj.folia
    return response
Example #25
def _safe_serve(params, client_ip, client_hostname, cookie_data):
    # Note: Only logging imports here
    from config import WORK_DIR
    from logging import basicConfig as log_basic_config

    # Enable logging
    try:
        from config import LOG_LEVEL

        log_level = _convert_log_level(LOG_LEVEL)
    except ImportError:
        from logging import WARNING as LOG_LEVEL_WARNING

        log_level = LOG_LEVEL_WARNING
    log_basic_config(filename=path_join(WORK_DIR, "server.log"), level=log_level)

    # Do the necessary imports after enabling the logging, order critical
    try:
        from common import ProtocolError, ProtocolArgumentError, NoPrintJSONError
        from dispatch import dispatch
        from jsonwrap import dumps
        from message import Messager
        from session import get_session, init_session, close_session, NoSessionError, SessionStoreError
    except ImportError:
        # Note: Heisenbug trap for #612, remove after resolved
        from logging import critical as log_critical
        from sys import path as sys_path

        log_critical("Heisenbug trap reports: " + str(sys_path))
        raise

    init_session(client_ip, cookie_data=cookie_data)
    response_is_JSON = True
    try:
        # Unpack the arguments into something less obscure than the
        #   Python FieldStorage object (part dictonary, part list, part FUBAR)
        http_args = DefaultNoneDict()
        for k in params:
            # Also take the opportunity to convert Strings into Unicode,
            #   according to HTTP they should be UTF-8
            try:
                http_args[k] = str(params.getvalue(k), encoding="utf-8")
            except TypeError:
                Messager.error(
                    "protocol argument error: expected string argument %s, got %s" % (k, type(params.getvalue(k)))
                )
                raise ProtocolArgumentError

        # Dispatch the request
        json_dic = dispatch(http_args, client_ip, client_hostname)
    except ProtocolError as e:
        # Internal error, only reported to client not to log
        json_dic = {}
        e.json(json_dic)

        # Add a human-readable version of the error
        err_str = str(e)
        if err_str != "":
            Messager.error(err_str, duration=-1)
Example #26
def __read_term_hierarchy(input):
    root_nodes    = []
    last_node_at_depth = {}

    macros = {}
    for l in input:
        # skip empties and lines starting with '#'
        if l.strip() == '' or re.match(r'^\s*#', l):
            continue

        # interpret lines of only hyphens as separators
        # for display
        if re.match(r'^\s*-+\s*$', l):
            # TODO: proper placeholder and placing
            root_nodes.append(SEPARATOR_STR)
            continue

        # interpret lines of the format <STR1>=STR2 as "macro"
        # definitions, defining <STR1> as a placeholder that should be
        # replaced with STR2 wherever it occurs.
        m = re.match(r'^<([a-zA-Z_-]+)>=\s*(.*?)\s*$', l)
        if m:
            name, value = m.groups()
            if name in reserved_macro_name:
                Messager.error("Cannot redefine <%s> in configuration, it is a reserved name." % name)
                # TODO: proper exception
                assert False
            else:
                macros["<%s>" % name] = value
            continue

        # macro expansion
        for n in macros:
            l = l.replace(n, macros[n])
        
        m = re.match(r'^(\s*)([^\t]+)(?:\t(.*))?$', l)
        assert m, "Error parsing line: '%s'" % l
        indent, terms, args = m.groups()
        terms = [t.strip() for t in terms.split("|") if t.strip() != ""]
        if args is None or args.strip() == "":
            args = []
        else:
            args = [a.strip() for a in args.split(",") if a.strip() != ""]

        # depth in the ontology corresponds to the number of
        # spaces in the initial indent.
        depth = len(indent)

        n = TypeHierarchyNode(terms, args)
        if depth == 0:
            # root level, no children assignments
            root_nodes.append(n)
        else:
            # assign as child of last node at the depth of the parent
            assert depth-1 in last_node_at_depth, "Error: no parent for '%s'" % l
            last_node_at_depth[depth-1].children.append(n)
        last_node_at_depth[depth] = n

    return root_nodes
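
To make the conventions concrete, a small hypothetical input for __read_term_hierarchy: a macro definition, one-space-per-level indentation (depth = number of leading spaces), |-separated synonymous terms, a tab-separated argument list, and a hyphen separator line:

<POG>=Protein|Gene

Entity
 <POG>	Arg:Theme
  Protein-part
-----
Event

Here <POG> expands to 'Protein|Gene' before parsing, the expanded node becomes a child of Entity, Protein-part a child of that node, and the hyphen line appends SEPARATOR_STR at the root level.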
Example #27
def _listdir(directory):
    # return listdir(directory)
    try:
        assert_allowed_to_read(directory)
        return [f for f in listdir(directory) if not _is_hidden(f) and allowed_to_read(path_join(directory, f))]
    except OSError as e:
        Messager.error("Error listing %s: %s" % (directory, e))
        raise AnnotationCollectionNotFoundError(directory)
Example #28
def whoami():
    json_dic = {}
    try:
        json_dic['user'] = get_session().get('user')
    except KeyError:
        # TODO: Really send this message?
        Messager.error('Not logged in!', duration=3)
    return json_dic
Example #29
def whoami():
    json_dic = {}
    try:
        json_dic["user"] = get_session().get("user")
    except KeyError:
        # TODO: Really send this message?
        Messager.error("Not logged in!", duration=3)
    return json_dic
Example #31
def getAnnObject2(collection,document):
    '''newest version of the getAnnObject methode'''
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(collection)
    except Exception:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    ann = None
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir+document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            #TODO:good error message
            ann.folia=get_extra_info(collection,document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    #Validation:
    try:
        import os
        import simplejson as json
        import session
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        #validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:    
                from verify_annotations import verify_annotation
                projectconf = ProjectConfiguration(docdir)
                issues = []
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
Example #32
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, "r") as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error("Error reading text file: nonstandard encoding or binary?", -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic["text"] = text

    from logging import info as log_info

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == "mecab":
        from tokenise import jp_token_boundary_gen

        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == "whitespace":
        from tokenise import whitespace_token_boundary_gen

        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == "ptblike":
        from tokenise import gtb_token_boundary_gen

        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning("Unrecognized tokenisation option " ", reverting to whitespace tokenisation.")
        from tokenise import whitespace_token_boundary_gen

        tok_offset_gen = whitespace_token_boundary_gen
    j_dic["token_offsets"] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == "newline":
        from ssplit import newline_sentence_boundary_gen

        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == "regex":
        from ssplit import regex_sentence_boundary_gen

        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning("Unrecognized sentence splitting option " ", reverting to newline sentence splitting.")
        from ssplit import newline_sentence_boundary_gen

        ss_offset_gen = newline_sentence_boundary_gen
    j_dic["sentence_offsets"] = [o for o in ss_offset_gen(text)]

    return True
Example #33
def __set_db_measure(db, measure):
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    ss_measure_by_str = {"cosine": simstring.cosine, "overlap": simstring.overlap}
    db.measure = ss_measure_by_str[measure]
Example #34
def __import_simstring():
    try:
        import simstring
    except ImportError:
        Messager.error('''Error: failed to import the simstring library.
        This library is required for approximate string matching DB lookup.
        Please install simstring and its Python bindings from
        http://www.chokkan.org/software/simstring/''', duration=-1)
        raise NoSimStringError
Example #35
def _safe_serve(params, client_ip, client_hostname, cookie_data):
    # Note: Only logging imports here
    from config import WORK_DIR
    from logging import basicConfig as log_basic_config

    # Enable logging
    try:
        from config import LOG_LEVEL
        log_level = _convert_log_level(LOG_LEVEL)
    except ImportError:
        from logging import WARNING as LOG_LEVEL_WARNING
        log_level = LOG_LEVEL_WARNING
    log_basic_config(filename=path_join(WORK_DIR, 'server.log'),
                     level=log_level)

    # Do the necessary imports after enabling the logging, order critical
    try:
        from common import ProtocolError, ProtocolArgumentError, NoPrintJSONError
        from dispatch import dispatch
        from jsonwrap import dumps
        from message import Messager
        from session import get_session, init_session, close_session, NoSessionError, SessionStoreError
    except ImportError:
        # Note: Heisenbug trap for #612, remove after resolved
        from logging import critical as log_critical
        from sys import path as sys_path
        log_critical('Heisenbug trap reports: ' + str(sys_path))
        raise

    init_session(client_ip, cookie_data=cookie_data)
    response_is_JSON = True
    try:
        # Unpack the arguments into something less obscure than the
        #   Python FieldStorage object (part dictonary, part list, part FUBAR)
        http_args = DefaultNoneDict()
        for k in params:
            # Also take the opportunity to convert Strings into Unicode,
            #   according to HTTP they should be UTF-8
            try:
                http_args[k] = str(params.getvalue(k), encoding='utf-8')
            except TypeError:
                Messager.error(
                    'protocol argument error: expected string argument %s, got %s'
                    % (k, type(params.getvalue(k))))
                raise ProtocolArgumentError

        # Dispatch the request
        json_dic = dispatch(http_args, client_ip, client_hostname)
    except ProtocolError as e:
        # Internal error, only reported to client not to log
        json_dic = {}
        e.json(json_dic)

        # Add a human-readable version of the error
        err_str = str(e)
        if err_str != '':
            Messager.error(err_str, duration=-1)
Example #36
def _listdir(directory):
    # return listdir(directory)
    try:
        assert_allowed_to_read(directory)
        return [f for f in listdir(directory) if not _is_hidden(f)
                and allowed_to_read(path_join(directory, f))]
    except OSError as e:
        Messager.error("Error listing %s: %s" % (directory, e))
        raise AnnotationCollectionNotFoundError(directory)
Example #37
    def __init__(self, document, read_only=False):
        # TODO: DOC!
        # TODO: Incorporate file locking! Is the destructor called upon interpreter crash?
        from collections import defaultdict
        from os.path import basename, getmtime, getctime
        #from fileinput import FileInput, hook_encoded

        # we should remember this
        self._document = document

        self.failed_lines = []

        ### Here be dragons, these objects need constant updating and syncing
        # Annotation for each line of the file
        self._lines = []
        # Mapping between annotation objects and which line they occur on
        # Range: [0, inf.) unlike [1, inf.) which is common for files
        self._line_by_ann = {}
        # Maximum id number used for each id prefix, to speed up id generation
        # XXX: This is effectively broken by the introduction of id suffixes
        self._max_id_num_by_prefix = defaultdict(lambda: 1)
        # Annotation by id, not including non-ided annotations
        self._ann_by_id = {}
        ###

        ## We use some heuristics to find the appropriate annotation files
        self._read_only = read_only
        input_files = self._select_input_files(document)

        if not input_files:
            raise AnnotationFileNotFoundError(document)

        # We then try to open the files we got using the heuristics
        #self._file_input = FileInput(openhook=hook_encoded('utf-8'))
        self._input_files = input_files

        # Finally, parse the given annotation file
        try:
            self._parse_ann_file()
        
            # Sanity checking that can only be done post-parse
            self._sanity()
        except UnicodeDecodeError:
            Messager.error('Encoding error reading annotation file: '
                    'nonstandard encoding or binary?', -1)
            # TODO: more specific exception
            raise AnnotationFileNotFoundError(document)

        #XXX: Hack to get the timestamps after parsing
        if (len(self._input_files) == 1 and
                self._input_files[0].endswith(JOINED_ANN_FILE_SUFF)):
            self.ann_mtime = getmtime(self._input_files[0])
            self.ann_ctime = getctime(self._input_files[0])
        else:
            # We don't have a single file, just set to epoch for now
            self.ann_mtime = 0
            self.ann_ctime = 0
Example #38
def _text_for_offsets(text, offsets):
    """
    Given a text and a list of (start, end) integer offsets, returns
    the (catenated) text corresponding to those offsets.
    """
    try:
        return "".join([text[s:e] for s,e in offsets])
    except Exception:
        Messager.error('_text_for_offsets: failed to get text for given offsets (%s)' % str(offsets))
        raise ProtocolArgumentError
Example #39
def update_dump(j_dic, file_path):
    app_path = WORK_DIR + "/application/"
    temp_paths = file_path.split("/data/")
    try:
        full_name = temp_paths[1].replace("/", "")
        temp = open(app_path + full_name, 'wb')
        pickle_dump(j_dic, temp)
        temp.close()
    except Exception as e:
        Messager.error("Error while caching changes in the annotation file: "+str(e))
Example #40
File: search.py Project: dmcc/brat
def search_anns_for_event(ann_objs, trigger_text, args, restrict_types=[], ignore_types=[]):
    """
    Searches the given Annotations objects for Event annotations
    matching the given specification. Returns a SearchMatchSet object.
    """

    # treat None and empty list uniformly
    restrict_types = [] if restrict_types is None else restrict_types
    ignore_types   = [] if ignore_types is None else ignore_types

    # TODO: include args in description
    description = "Event triggered by text containing '%s'" % trigger_text
    if restrict_types != []:
        description = description + ' (of type %s)' % (",".join(restrict_types))
    matches = SearchMatchSet(description)
    
    for ann_obj in ann_objs:
        # collect per-document (ann_obj) for sorting
        ann_matches = []

        for e in ann_obj.get_events():
            if e.type in ignore_types:
                continue
            if restrict_types != [] and e.type not in restrict_types:
                continue

            try:
                t_ann = ann_obj.get_ann_by_id(e.trigger)
            except Exception:
                # TODO: specific exception
                Messager.error('Failed to retrieve trigger annotation %s, skipping event %s in search' % (e.trigger, e.id))
                continue

            # TODO: make options for "text included" vs. "text matches"
            # TODO: remove temporary hack giving special status to "*"
            if (trigger_text is not None and trigger_text != "" and
                trigger_text != "*" and trigger_text not in t_ann.text):
                continue

            # TODO: argument constraints
            if len(args) != 0:
                Messager.warning('NOTE: ignoring event argument constraints in search (not implemented yet, sorry!)')

            ann_matches.append((t_ann, e))

        # sort by trigger start offset
        ann_matches.sort(key=lambda m: (m[0].start, -m[0].end))

        # add to overall collection
        for t_obj, e in ann_matches:
            matches.add_match(ann_obj, e)

    # sort by document name for output
    matches.sort_matches()

    return matches
Example #41
File: auth.py Project: jjon/brat
def whoami():
    json_dic = {}
    try:
        if USER_PASSWORD != False:
            json_dic['user'] = get_session().get('user')
        else:
            json_dic['anonymous'] = True
    except KeyError:
        # TODO: Really send this message?
        Messager.error('Not logged in!', duration=3)
    return json_dic
Example #42
def _text_for_offsets(text, offsets):
    """Given a text and a list of (start, end) integer offsets, returns the
    (catenated) text corresponding to those offsets, joined appropriately for
    use in a TextBoundAnnotation(WithText)."""
    try:
        return DISCONT_SEP.join(text[s:e] for s, e in offsets)
    except Exception:
        Messager.error(
            '_text_for_offsets: failed to get text for given offsets (%s)' %
            str(offsets))
        raise ProtocolArgumentError
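
A worked case of the discontinuous join (assuming DISCONT_SEP is a single space; its actual value is defined elsewhere in the module):

text = 'abcdefghij'
offsets = [(0, 3), (7, 10)]
# text[0:3] == 'abc', text[7:10] == 'hij'
# _text_for_offsets(text, offsets) == 'abc hij'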
Example #43
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    from logging import info as log_info

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %r, '
                         'reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %r, '
                         'reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
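
The tokenisers and sentence splitters are imported from tokenise and ssplit; the shared interface is a generator of (start, end) character offsets. As a rough sketch of what the whitespace variant presumably does (an assumption, not the project's implementation):

import re

def whitespace_token_boundary_gen(text):
    # Yield (start, end) offsets of maximal runs of non-whitespace.
    for match in re.finditer(r'\S+', text):
        yield match.start(), match.end()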
Example #44
def _create_relation(ann_obj, projectconf, mods, origin, target, type,
                     attributes, old_type, old_target, undo_resp={}):
    attributes = _parse_attributes(attributes)

    if old_type is not None or old_target is not None:
        assert type in projectconf.get_relation_types(), (
                ('attempting to convert relation to non-relation "%s" ' % (target.type, )) +
                ('(legit types: %s)' % (str(projectconf.get_relation_types()), )))

        sought_target = (old_target
                if old_target is not None else target.id)
        sought_type = (old_type
                if old_type is not None else type)

        # We are to change the type, target, and/or attributes
        found = None
        for ann in ann_obj.get_relations():
            if ann.arg2 == sought_target and ann.type == sought_type:
                found = ann
                break

        if found is None:
            # TODO: better response
            Messager.error('_create_relation: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target)))
        elif found.arg2 == target.id and found.type == type:
            # no changes to type or target
            pass
        else:
            # type and/or target changed, mark.
            before = str(found)
            found.arg2 = target.id
            found.type = type
            mods.change(before, found)

        target_ann = found
    else:
        # Create a new annotation
        new_id = ann_obj.get_new_id('R')
        rel = projectconf.get_relation_by_type(type)
        assert rel is not None and len(rel.arg_list) == 2
        a1l, a2l = rel.arg_list
        ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id, a2l, target.id, '\t')
        mods.addition(ann)
        ann_obj.add_annotation(ann)

        target_ann = ann

    # process attributes
    if target_ann is not None:
        _set_attributes(ann_obj, target_ann, attributes, mods, undo_resp)
    elif attributes is not None:
        Messager.error('_create_relation: cannot set arguments: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target)))

    return target_ann
Example #45
def update_dump(j_dic, file_path):
    app_path = WORK_DIR + "/application/"
    temp_paths = file_path.split("/data/")
    try:
        full_name = temp_paths[1].replace("/", "")
        temp = open(app_path + full_name, 'wb')
        pickle_dump(j_dic, temp)
        temp.close()
    except Exception as e:
        Messager.error("Error while caching changes in the annotation file: " +
                       str(e))
Example #46
def _text_for_offsets(text, offsets):
    """
    Given a text and a list of (start, end) integer offsets, returns
    the (catenated) text corresponding to those offsets, joined
    appropriately for use in a TextBoundAnnotation(WithText).
    """
    try:
        return DISCONT_SEP.join(text[s:e] for s,e in offsets)
    except Exception:
        Messager.error('_text_for_offsets: failed to get text for given offsets (%s)' % str(offsets))
        raise ProtocolArgumentError
Example #47
    def _read_document_text(self, document):
        # TODO: this is too naive; document may be e.g. "PMID.a1",
        # in which case the reasonable text file name guess is
        # "PMID.txt", not "PMID.a1.txt"
        textfn = document + "." + TEXT_FILE_SUFFIX
        try:
            with open_textfile(textfn, 'r') as f:
                text = f.read()
                return text
        except Exception:
            Messager.error('Error reading document text from %s' % textfn)
        return None
Example #48
def jp_token_boundary_gen(text):
    try:
        from mecab import token_offsets_gen
        for o in token_offsets_gen(text):
            yield o
    except ImportError:
        from message import Messager
        Messager.error('Failed to import MeCab, '
                       'falling back on whitespace tokenization. '
                       'Please check configuration and/or server setup.')
        for o in whitespace_token_boundary_gen(text):
            yield o
Example #49
File: tag.py Project: dmcc/brat
def tag_file(directory, document):
    import os
    textfn      = os.path.join(DATA_DIR, directory, document+'.txt')
    tagger_root = os.path.join(BASE_DIR, '../nlpwrap')
    tagger_cmd  = os.path.join(tagger_root, 'tag-NERsuite.sh')+" "+textfn
    try:
        os.system(tagger_cmd)
    except Exception as e:
        Messager.error("Failed to run tagger. Please contact the administrator(s).", duration=-1)
        from sys import stderr
        print(e, file=stderr)
        return
Example #50
def __set_db_measure(db, measure):
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    ss_measure_by_str = {
        'cosine': simstring.cosine,
        'overlap': simstring.overlap,
    }
    db.measure = ss_measure_by_str[measure]
Example #51
def _json_offsets_to_list(offsets):
    try:
        offsets = json_loads(offsets)
    except Exception:
        Messager.error('create_span: protocol argument error: expected offsets as JSON, but failed to parse "%s"' % str(offsets))
        raise ProtocolArgumentError
    try:
        offsets = [(int(s),int(e)) for s,e in offsets]
    except Exception:
        Messager.error('create_span: protocol argument error: expected offsets as list of int pairs, received "%s"' % str(offsets))
        raise ProtocolArgumentError
    return offsets
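
For example, a client passing the JSON string '[[0, 5], [10, 15]]' gets back the list of int pairs; anything else fails loudly:

offsets = _json_offsets_to_list('[[0, 5], [10, 15]]')
# offsets == [(0, 5), (10, 15)]
# Malformed input reports an error and raises ProtocolArgumentError.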
Example #52
def ssdb_supstring_lookup(s,
                          dbname,
                          threshold=DEFAULT_THRESHOLD,
                          with_score=False):
    '''
    Given a string s and a DB name, returns the strings in the
    associated simstring DB that likely contain s as an (approximate)
    substring. If with_score is True, returns pairs of (str,score)
    where score is the fraction of n-grams in s that are also found in
    the matched string.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    db = ssdb_open(dbname.encode('UTF-8'))

    __set_db_measure(db, 'overlap')
    db.threshold = threshold

    result = db.retrieve(s)
    db.close()

    # assume simstring DBs always contain UTF-8-encoded strings
    result = [r.decode('UTF-8') for r in result]
    s = s.decode('UTF-8')

    # The simstring overlap measure is symmetric and thus does not
    # differentiate between substring and superstring matches.
    # Replicate a small bit of the simstring functionality (mostly the
    # ngrams() function) to filter to substrings only.
    s_ngrams = ngrams(s)
    filtered = []
    for r in result:
        if s in r:
            # avoid calculation: simple containment => score=1
            if with_score:
                filtered.append((r, 1.0))
            else:
                filtered.append(r)
        else:
            r_ngrams = ngrams(r)
            overlap = s_ngrams & r_ngrams
            if len(overlap) >= len(s_ngrams) * threshold:
                if with_score:
                    filtered.append((r, 1.0 * len(overlap) / len(s_ngrams)))
                else:
                    filtered.append(r)

    return filtered
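The substring filter above depends on an ngrams() helper that is not shown. A plausible minimal version, assuming character trigrams collected into a set (simstring itself defaults to n=3; the padding scheme here is illustrative):

def ngrams(s, n=3):
    # Pad so strings shorter than n still yield at least one n-gram,
    # then collect every character n-gram into a set.
    pad = '\x00' * (n - 1)
    padded = pad + s + pad
    return set(padded[i:i + n] for i in range(len(padded) - n + 1))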
Example #53
0
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry: the client currently crashes on
    # Chrome for two or more consecutive spaces, so replace every
    # second one with a literal non-breaking space. Note that this is
    # just for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + chr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
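The double-space rewrite above is easy to misread; a small demonstration of what it does to a run of spaces:

text = 'one  two   three'
fixed = text.replace('  ', ' ' + chr(0x00A0))
# Each pair of spaces becomes a space followed by U+00A0, so the
# browser no longer collapses the run; on-disk text is unchanged.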
Example #54
0
def _delete_arc_with_ann(origin, target, type_, mods, ann_obj, projectconf):
    origin_ann = ann_obj.get_ann_by_id(origin)

    # specifics of delete determined by arc type (equiv relation,
    # other relation, event argument)
    if projectconf.is_relation_type(type_):
        if projectconf.is_equiv_type(type_):
            _delete_arc_equiv(origin, target, type_, mods, ann_obj)
        else:
            _delete_arc_nonequiv_rel(origin, target, type_, mods, ann_obj)
    elif projectconf.is_event_type(origin_ann.type):
        _delete_arc_event_arg(origin, target, type_, mods, ann_obj)
    else:
        Messager.error('Unknown annotation types for delete')
Example #55
0
def getAnnObject(collection, document):
    try:
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if os.path.isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get_extra_info() failed: %s' % e)
    # Validation:
    try:
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if the config enables it and it's not already done
        if val:
            if not ann.validated:
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
Example #56
0
def validate_annotation(ann, projectconf):
    '''
    Added by Sander Naert
    Checks an annotation file against user-defined rules.
    '''
    from message import Messager
    from validation_rule import ValidationRules
    issues = []
    try:
        vrules = ValidationRules(projectconf)
        issues = vrules.validate(ann)[0]
    except Exception as e:
        Messager.error("Error validating annotations: " + str(e))
    return issues
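A usage sketch (names follow the example above; the exact shape of the returned issue objects is an assumption):

# Returns a (possibly empty) list of issue objects
issues = validate_annotation(ann, projectconf)
print('%d validation issue(s) found' % len(issues))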
Example #57
0
def _permission_check():
    from os import access, R_OK, W_OK
    from config import DATA_DIR, WORK_DIR
    from jsonwrap import dumps
    from message import Messager

    if not access(WORK_DIR, R_OK | W_OK):
        Messager.error('Work dir: "%s" is not readable and writable '
                       'by the server' % WORK_DIR, duration=-1)
        raise PermissionError

    if not access(DATA_DIR, R_OK):
        Messager.error('Data dir: "%s" is not readable by the server'
                       % DATA_DIR, duration=-1)
        raise PermissionError
Example #58
0
def _json_offsets_to_list(offsets):
    try:
        offsets = json_loads(offsets)
    except Exception:
        Messager.error(
            'create_span: protocol argument error: expected offsets as JSON, but failed to parse "%s"' %
            str(offsets))
        raise ProtocolArgumentError
    try:
        offsets = [(int(s), int(e)) for s, e in offsets]
    except Exception:
        Messager.error(
            'create_span: protocol argument error: expected offsets as list of int pairs, received "%s"' %
            str(offsets))
        raise ProtocolArgumentError
    return offsets
Example #59
0
def ssdb_open(dbname):
    '''
    Given a DB name, opens it as a simstring DB and returns the handle.
    The caller is responsible for invoking close() on the handle.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    try:
        return simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
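Putting the pieces together, a usage sketch based only on the calls shown in these examples (the database name and query string are illustrative):

db = ssdb_open('mydb'.encode('UTF-8'))
try:
    db.threshold = 0.7
    hits = db.retrieve('p53')   # approximate matches above the threshold
finally:
    db.close()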