def get_base_types(directory):
    """Collect event, entity, relation and visual-only type configurations
    for the project rooted at *directory*."""
    project_conf = ProjectConfiguration(directory)
    # Invert the shortcut-key -> type map into type -> shortcut-key.
    hotkey_by_type = {}
    for key, ann_type in project_conf.get_kb_shortcuts().iteritems():
        hotkey_by_type[ann_type] = key
    # fill config for nodes for which annotation is configured;
    # calculate connections once only (this can get heavy)
    all_connections = project_conf.all_connections()
    event_types = _fill_type_configuration(
        project_conf.get_event_type_hierarchy(),
        project_conf, hotkey_by_type, all_connections)
    entity_types = _fill_type_configuration(
        project_conf.get_entity_type_hierarchy(),
        project_conf, hotkey_by_type, all_connections)
    relation_types = _fill_relation_configuration(
        project_conf.get_relation_type_hierarchy(),
        project_conf, hotkey_by_type)
    # Make visual config available also for nodes with no annotation
    # config. Defaults (SPAN_DEFAULT etc.) come in via
    # get_drawing_types() if defined.
    candidates = (project_conf.get_labels().keys() +
                  project_conf.get_drawing_types())
    unconfigured = [l for l in candidates
                    if not project_conf.is_configured_type(l)]
    unconf_types = _fill_visual_configuration(unconfigured, project_conf)
    return event_types, entity_types, relation_types, unconf_types
def allowed_to_read(real_path):
    """Check per-directory access control for *real_path* (default: allow)."""
    data_path = path_join('/', relpath(real_path, DATA_DIR))
    # robots.txt compliance requires directories to end with a slash
    if isdir(real_path):
        data_path = '%s/' % (data_path)
    real_dir = dirname(real_path)
    robotparser = ProjectConfiguration(real_dir).get_access_control()
    if robotparser is None:
        return True  # default allow
    # directory read-permission check against the session user
    try:
        user = get_session().get('user')
        if user is None:
            # NOTE(review): a logged-out user gets a fallback name and the
            # check continues, while a missing session key denies outright
            # below -- confirm this asymmetry is intended.
            Messager.error('没有登录!', duration=3)
            user = '******'
    except KeyError:
        Messager.error('没有登录!', duration=3)
        return False
    return robotparser.can_fetch(user, data_path)
def reverse_arc(collection, document, origin, target, type, attributes=None):
    """Swap the direction of an existing configured binary relation arc."""
    # undo_resp = {}  # TODO
    real_dir = real_directory(collection)
    # mods = ModificationTracker()  # TODO
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # refuse to touch read-only documents
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse; locate the relation to flip
            # (TODO: more sensible lookup)
            found = None
            for ann in ann_obj.get_relations():
                if (ann.arg1 == origin and ann.arg2 == target
                        and ann.type == type):
                    found = ann
                    break
            if found is None:
                Messager.error('reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' % (str(origin), str(target), str(type)))
            else:
                # found it; just swap the endpoints in place
                found.arg1, found.arg2 = found.arg2, found.arg1
        # TODO: modification tracker
        json_response = {'annotations': _json_from_ann(ann_obj)}
        return json_response
def possible_arc_types(collection, origin_type, target_type):
    """Return the arc types the configuration allows from *origin_type*
    to *target_type*, with (legacy) HTML/keymap payload for the client."""
    real_dir = real_directory(collection)
    projectconf = ProjectConfiguration(real_dir)
    response = {}
    try:
        possible = projectconf.arc_types_from_to(origin_type, target_type)
        # TODO: proper error handling
        if possible is None:
            Messager.error('Error selecting arc types!', -1)
        elif possible == []:
            # nothing to select
            response['html'] = generate_empty_fieldset()
            response['keymap'] = {}
            response['empty'] = True
        else:
            # XXX TODO: intentionally breaking this; KB shortcuts
            # should no longer be sent here. Remove 'keymap' and
            # 'html' args once clientside generation done.
            arc_kb_shortcuts = {}  # select_keyboard_shortcuts(possible)
            response['keymap'] = {}
            for k, p in arc_kb_shortcuts.items():
                response['keymap'][k] = "arc_" + p
            response['html'] = generate_arc_type_html(projectconf, possible,
                                                      arc_kb_shortcuts)
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # are no longer reported as arc selection errors; the exception
        # is still propagated to the caller.
        Messager.error('Error selecting arc types!', -1)
        raise
    return response
def tag(collection, document, tagger): pconf = ProjectConfiguration(real_directory(collection)) for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config(): if tagger == tagger_token: break else: raise UnknownTaggerError(tagger) doc_path = path_join(real_directory(collection), document) with TextAnnotations(path_join(real_directory(collection), document)) as ann_obj: url_soup = urlparse(tagger_service_url) if url_soup.scheme == 'http': Connection = HTTPConnection elif url_soup.scheme == 'https': # Delayed HTTPS import since it relies on SSL which is commonly # missing if you roll your own Python, for once we should not # fail early since tagging is currently an edge case and we # can't allow it to bring down the whole server. from httplib import HTTPSConnection Connection = HTTPSConnection else: raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme) conn = None try: conn = Connection(url_soup.netloc) req_headers = { 'Content-type': 'text/plain; charset=utf-8', 'Accept': 'application/json', } # Build a new service URL since the request method doesn't accept # a parameters argument service_url = url_soup.path + ('?' + url_soup.query if url_soup.query else '') try: data = ann_obj.get_document_text().encode('utf-8') req_headers['Content-length'] = len(data) # Note: Trout slapping for anyone sending Unicode objects here conn.request( 'POST', # As per: http://bugs.python.org/issue11898 # Force the url to be an ascii string str(url_soup.path), data, headers=req_headers) except SocketError, e: raise TaggerConnectionError(tagger_token, e) resp = conn.getresponse() # Did the request succeed? if resp.status != 200: raise TaggerConnectionError( tagger_token, '%s %s' % (resp.status, resp.reason)) # Finally, we can read the response data resp_data = resp.read() finally:
def tag(collection, document, tagger): pconf = ProjectConfiguration(real_directory(collection)) for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config(): if tagger == tagger_token: break else: raise UnknownTaggerError(tagger) doc_path = path_join(real_directory(collection), document) with TextAnnotations(path_join(real_directory(collection), document)) as ann_obj: url_soup = urlparse(tagger_service_url) if url_soup.scheme == 'http': Connection = HTTPConnection elif url_soup.scheme == 'https': # Delayed HTTPS import since it relies on SSL which is commonly # missing if you roll your own Python, for once we should not # fail early since tagging is currently an edge case and we # can't allow it to bring down the whole server. from httplib import HTTPSConnection Connection = HTTPSConnection else: raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme) conn = None try: conn = Connection(url_soup.netloc) req_headers = { 'Content-type': 'text/plain; charset=utf-8', 'Accept': 'application/json', } # Build a new service URL since the request method doesn't accept # a parameters argument service_url = url_soup.path + ( '?' + url_soup.query if url_soup.query else '') try: data = ann_obj.get_document_text().encode('utf-8') req_headers['Content-length'] = len(data) # Note: Trout slapping for anyone sending Unicode objects here conn.request('POST', # As per: http://bugs.python.org/issue11898 # Force the url to be an ascii string str(url_soup.path), data, headers=req_headers) except SocketError, e: raise TaggerConnectionError(tagger_token, e) resp = conn.getresponse() # Did the request succeed? if resp.status != 200: raise TaggerConnectionError(tagger_token, '%s %s' % (resp.status, resp.reason)) # Finally, we can read the response data resp_data = resp.read() finally:
def get_span_types(directory):
    """Collect span, attribute, relation and visual-only type configs
    for the project at *directory*."""
    project_conf = ProjectConfiguration(directory)
    # annotation type -> keyboard shortcut
    hotkey_by_type = dict(
        (t, key) for key, t in project_conf.get_kb_shortcuts().iteritems())
    # fill config for nodes for which annotation is configured
    event_types = _fill_type_configuration(
        project_conf.get_event_type_hierarchy(), project_conf, hotkey_by_type)
    entity_types = _fill_type_configuration(
        project_conf.get_entity_type_hierarchy(), project_conf, hotkey_by_type)
    attribute_types = _fill_attribute_configuration(
        project_conf.get_attribute_type_hierarchy(), project_conf)
    relation_types = _fill_relation_configuration(
        project_conf.get_relation_type_hierarchy(), project_conf,
        hotkey_by_type)
    # visual config also for unannotated node types ...
    unconfigured = [l for l in project_conf.get_labels()
                    if not project_conf.is_configured_type(l)]
    # ... and include the defaults.
    unconfigured.extend([VISUAL_SPAN_DEFAULT, VISUAL_ARC_DEFAULT])
    unconf_types = _fill_visual_configuration(unconfigured, project_conf)
    return (event_types, entity_types, attribute_types, relation_types,
            unconf_types)
def get_base_types(directory):
    """Return (event, entity, relation, unconfigured-visual) type
    configurations for *directory*."""
    project_conf = ProjectConfiguration(directory)
    keymap = project_conf.get_kb_shortcuts()
    # type -> shortcut key (inverse of the configured map)
    hotkey_by_type = dict(zip(keymap.values(), keymap.keys()))
    # connections can be heavy to compute; do it exactly once
    all_connections = project_conf.all_connections()

    def configured(hierarchy):
        # annotation-configured node types
        return _fill_type_configuration(hierarchy, project_conf,
                                        hotkey_by_type, all_connections)

    event_types = configured(project_conf.get_event_type_hierarchy())
    entity_types = configured(project_conf.get_entity_type_hierarchy())
    relation_types = _fill_relation_configuration(
        project_conf.get_relation_type_hierarchy(), project_conf,
        hotkey_by_type)
    # Visual config also for nodes lacking annotation config; defaults
    # (SPAN_DEFAULT etc.) are included via get_drawing_types() if defined.
    unconfigured = [
        l for l in (project_conf.get_labels().keys() +
                    project_conf.get_drawing_types())
        if not project_conf.is_configured_type(l)]
    unconf_types = _fill_visual_configuration(unconfigured, project_conf)
    return event_types, entity_types, relation_types, unconf_types
def get_attribute_types(directory):
    """Return (entity, relation, event) attribute type configurations."""
    project_conf = ProjectConfiguration(directory)
    hierarchies = (project_conf.get_entity_attribute_type_hierarchy(),
                   project_conf.get_relation_attribute_type_hierarchy(),
                   project_conf.get_event_attribute_type_hierarchy())
    entity_attribute_types, relation_attribute_types, event_attribute_types = [
        _fill_attribute_configuration(h, project_conf) for h in hierarchies]
    return (entity_attribute_types, relation_attribute_types,
            event_attribute_types)
def get_span_types(directory):
    """Gather all span-related type configuration for *directory*."""
    project_conf = ProjectConfiguration(directory)
    keymap = project_conf.get_kb_shortcuts()
    hotkey_by_type = {}
    for key, ann_type in keymap.iteritems():
        hotkey_by_type[ann_type] = key
    # annotation-configured types first
    event_hierarchy = project_conf.get_event_type_hierarchy()
    entity_hierarchy = project_conf.get_entity_type_hierarchy()
    attribute_hierarchy = project_conf.get_attribute_type_hierarchy()
    relation_hierarchy = project_conf.get_relation_type_hierarchy()
    event_types = _fill_type_configuration(event_hierarchy, project_conf,
                                           hotkey_by_type)
    entity_types = _fill_type_configuration(entity_hierarchy, project_conf,
                                            hotkey_by_type)
    attribute_types = _fill_attribute_configuration(attribute_hierarchy,
                                                    project_conf)
    relation_types = _fill_relation_configuration(relation_hierarchy,
                                                  project_conf,
                                                  hotkey_by_type)
    # then visual config for types without annotation config,
    # plus the built-in defaults
    unconfigured = [l for l in project_conf.get_labels()
                    if not project_conf.is_configured_type(l)]
    unconfigured += [VISUAL_SPAN_DEFAULT, VISUAL_ARC_DEFAULT]
    unconf_types = _fill_visual_configuration(unconfigured, project_conf)
    return (event_types, entity_types, attribute_types, relation_types,
            unconf_types)
def suggest_span_types(collection, document, start, end, text, model):
    """Query the configured disambiguation service for likely span types."""
    pconf = ProjectConfiguration(real_directory(collection))
    for _, _, model_str, model_url in pconf.get_disambiguator_config():
        if model_str == model:
            break
    else:
        # no disambiguator configured for this model name
        raise SimSemConnectionNotConfiguredError
    try:
        resp = urlopen(model_url % quote_plus(text), None, QUERY_TIMEOUT)
    except URLError:
        # TODO: could give more details
        raise SimSemConnectionError
    preds = loads(resp.read())['result'][text.decode('utf-8')]
    # keep the top predictions until CUT_OFF confidence mass is covered
    selected_preds = []
    conf_sum = 0
    for cat, conf in preds:
        selected_preds.append((cat, conf))
        conf_sum += conf
        if conf_sum >= CUT_OFF:
            break
    log_annotation(collection, document, 'DONE', 'suggestion',
                   [None, None, text] + [selected_preds])
    # array so that server can control presentation order in UI
    # independently from scores if needed
    return {
        'types': selected_preds,
        'collection': collection,  # echo for reference
        'document': document,
        'start': start,
        'end': end,
        'text': text,
    }
def tag(collection, document, tagger): pconf = ProjectConfiguration(real_directory(collection)) for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config(): if tagger == tagger_token: break else: raise UnknownTaggerError(tagger) doc_path = path_join(real_directory(collection), document) with TextAnnotations(path_join(real_directory(collection), document)) as ann_obj: url_soup = urlparse(tagger_service_url) if url_soup.scheme == 'http': Connection = HTTPConnection elif url_soup.scheme == 'https': Connection = HTTPSConnection else: raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme) conn = None try: conn = Connection(url_soup.netloc) req_headers = { 'Content-type': 'text/plain; charset=utf-8', 'Accept': 'application/json', } # Build a new service URL since the request method doesn't accept # a parameters argument service_url = url_soup.path + ('?' + url_soup.query if url_soup.query else '') try: conn.request( 'POST', url_soup.path, # The document text as body ann_obj.get_document_text().encode('utf8'), headers=req_headers) except SocketError, e: raise TaggerConnectionError(tagger_token, e) resp = conn.getresponse() # Did the request succeed? if resp.status != 200: raise TaggerConnectionError( tagger_token, '%s %s' % (resp.status, resp.reason)) finally:
def main(argv=None):
    """Command-line entry point: run annotation verification over the
    files named on the command line, printing one line per issue."""
    import os
    import sys
    if argv is None:
        argv = sys.argv
    arg = argparser().parse_args(argv[1:])
    for fn in arg.files:
        try:
            projectconf = ProjectConfiguration(os.path.dirname(fn))
            # Strip ".a2"/".rel" suffixes so Annotations also parses .a1.
            # (TODO: ".ann" is stripped too, to work around a bug in
            # TextAnnotations; this should not be necessary.)
            nosuff_fn = fn
            for suffix in (".a2", ".rel", ".ann"):
                nosuff_fn = nosuff_fn.replace(suffix, "")
            with annotation.TextAnnotations(nosuff_fn) as ann_obj:
                for issue in verify_annotation(ann_obj, projectconf):
                    print("%s:\t%s" % (fn, issue.human_readable_str()))
        except annotation.AnnotationFileNotFoundError:
            print("%s:\tFailed check: file not found" % fn, file=sys.stderr)
        except annotation.AnnotationNotFoundError as e:
            print("%s:\tFailed check: %s" % (fn, e), file=sys.stderr)
    if arg.verbose:
        print("Check complete.", file=sys.stderr)
def get_attribute_types(directory):
    """Return (entity, relation, event) attribute configs with shortcuts."""
    project_conf = ProjectConfiguration(directory)
    # annotation type -> keyboard shortcut
    hotkey_by_type = dict(
        (t, key) for key, t in project_conf.get_kb_shortcuts().iteritems())

    def attr_conf(hierarchy):
        # shared call shape for the three attribute categories
        return _fill_attribute_configuration(hierarchy, project_conf,
                                             hotkey_by_type)

    entity_attribute_types = attr_conf(
        project_conf.get_entity_attribute_type_hierarchy())
    relation_attribute_types = attr_conf(
        project_conf.get_relation_attribute_type_hierarchy())
    event_attribute_types = attr_conf(
        project_conf.get_event_attribute_type_hierarchy())
    return (entity_attribute_types, relation_attribute_types,
            event_attribute_types)
def create_comment(collection, document, id, comment=None):
    """Attach (or update) a comment on annotation *id* in *document*.

    Returns the modification-tracker JSON response, including undo data
    when available and the updated annotation payload.
    """
    undo_resp = {}
    real_dir = real_directory(collection)
    document = path_join(real_dir, document)
    # NOTE(review): projectconf is unused below; kept because constructing
    # it may validate/load the project config -- confirm before removing.
    projectconf = ProjectConfiguration(real_dir)
    # (Removed dead local: txt_file_path was computed from TEXT_FILE_SUFFIX
    # but never used; the original "XXX what is this doing here?" comment
    # already questioned it.)
    with TextAnnotations(document) as ann_obj:
        # bail as quickly as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        mods = ModificationTracker()
        _set_special_comments(ann_obj, id, comment, mods,
                              undo_resp=undo_resp)
        mods_json = mods.json_response()
        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    """Create or update an arc (equiv, relation, or event argument)
    between two annotations, optionally replacing an old arc."""
    undo_resp = {}
    real_dir = real_directory(collection)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quickly as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)
        # If a previous annotation exists but lives in a different arc
        # category (e.g. relation vs. event arg), process as
        # delete + create instead of an in-place update.
        if old_type is not None:
            category_changed = (
                projectconf.is_relation_type(old_type) !=
                projectconf.is_relation_type(type)
                or projectconf.is_equiv_type(old_type) !=
                projectconf.is_equiv_type(type))
            if category_changed:
                _delete_arc_with_ann(origin.id, old_target, old_type, mods,
                                     ann_obj, projectconf)
                old_target, old_type = None, None
        # dispatch on the arc category of the new type
        if projectconf.is_equiv_type(type):
            ann = _create_equiv(ann_obj, projectconf, mods, origin, target,
                                type, attributes, old_type, old_target)
        elif projectconf.is_relation_type(type):
            ann = _create_relation(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        else:
            ann = _create_argument(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods, undo_resp=undo_resp)
        elif comment is not None:
            Messager.warning(
                'create_arc: non-empty comment for None annotation (unsupported type for comment?)')
        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def tag(collection, document, tagger): pconf = ProjectConfiguration(real_directory(collection)) for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config(): if tagger == tagger_token: break else: raise UnknownTaggerError(tagger) doc_path = path_join(real_directory(collection), document) with TextAnnotations(path_join(real_directory(collection), document)) as ann_obj: url_soup = urlparse(tagger_service_url) if url_soup.scheme == 'http': Connection = HTTPConnection elif url_soup.scheme == 'https': Connection = HTTPSConnection else: raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme) conn = None try: conn = Connection(url_soup.netloc) req_headers = { 'Content-type': 'text/plain; charset=utf-8', 'Accept': 'application/json', } # Build a new service URL since the request method doesn't accept # a parameters argument service_url = url_soup.path + ( '?' + url_soup.query if url_soup.query else '') try: conn.request('POST', url_soup.path, # The document text as body ann_obj.get_document_text().encode('utf8'), headers=req_headers) except SocketError, e: raise TaggerConnectionError(tagger_token, e) resp = conn.getresponse() # Did the request succeed? if resp.status != 200: raise TaggerConnectionError(tagger_token, '%s %s' % (resp.status, resp.reason)) finally:
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    """Create an arc between two annotations, dispatching by arc category
    (equiv / relation / event argument)."""
    undo_resp = {}
    real_dir = real_directory(collection)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quickly as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)
        # pick the creator matching the configured category of "type"
        if projectconf.is_equiv_type(type):
            creator = _create_equiv
        elif projectconf.is_relation_type(type):
            creator = _create_relation
        else:
            creator = _create_argument
        ann = creator(ann_obj, projectconf, mods, origin, target, type,
                      attributes, old_type, old_target)
        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods, undo_resp=undo_resp)
        elif comment is not None:
            Messager.warning('create_arc: non-empty comment for None annotation (unsupported type for comment?)')
        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def tag(collection, document, tagger):
    """Run the configured *tagger* web service on *document* and add the
    returned text-bound annotations to it."""
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)
    doc_path = path_join(real_directory(collection), document)
    with TextAnnotations(doc_path) as ann_obj:
        try:
            # Note: Can we actually fit a whole document in here?
            quoted_doc_text = quote_plus(ann_obj.get_document_text())
            resp = urlopen(tagger_service_url % quoted_doc_text, None,
                           QUERY_TIMEOUT)
        except URLError:
            raise TaggerConnectionError(tagger_token)
        # TODO: Check for errors
        json_resp = loads(resp.read())
        mods = ModificationTracker()
        for ann_data in json_resp.itervalues():
            offsets = ann_data['offsets']
            # Note: We do not support discontinuous spans at this point
            assert len(offsets) == 1, 'discontinuous/null spans'
            start, end = offsets[0]
            _id = ann_obj.get_new_id('T')
            tb = TextBoundAnnotationWithText(start, end, _id,
                                             ann_data['type'],
                                             ann_data['text'])
            mods.addition(tb)
            ann_obj.add_annotation(tb)
        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
def suggest_span_types(collection, document, start, end, text, model):
    """Ask the configured SimSem-style disambiguator for span type
    suggestions for *text*."""
    pconf = ProjectConfiguration(real_directory(collection))
    matches = [cfg for cfg in pconf.get_disambiguator_config()
               if cfg[2] == model]
    if not matches:
        # We were unable to find a matching model
        raise SimSemConnectionNotConfiguredError
    model_url = matches[0][3]
    try:
        quoted_text = quote_plus(text)
        resp = urlopen(model_url % quoted_text, None, QUERY_TIMEOUT)
    except URLError:
        # TODO: Could give more details
        raise SimSemConnectionError
    json = loads(resp.read())
    preds = json['result'][text.decode('utf-8')]
    # accumulate predictions until the confidence cut-off is reached
    selected_preds = []
    conf_sum = 0
    for cat, conf in preds:
        selected_preds.append((cat, conf))
        conf_sum += conf
        if conf_sum >= CUT_OFF:
            break
    log_annotation(collection, document, 'DONE', 'suggestion',
                   [None, None, text, selected_preds])
    # array so that server can control presentation order in UI
    # independently from scores if needed
    return {
        'types': selected_preds,
        'collection': collection,  # echo for reference
        'document': document,
        'start': start,
        'end': end,
        'text': text,
    }
def _get_db_path(database, collection):
    """Resolve (path, unicode-flag) for normalization DB *database*.

    Falls back to (None, Simstring.DEFAULT_UNICODE) when *collection* is
    unset, the DB is not in the config, or config lookup fails.
    """
    if collection is None:
        # TODO: default to WORK_DIR config?
        return (None, Simstring.DEFAULT_UNICODE)
    # Fix: the original computed conf_dir/projectconf/norm_conf a first
    # time OUTSIDE the try block (then redundantly again inside), so a
    # config-loading failure bypassed the intended warn-and-fall-back
    # handling. The unprotected duplicate has been removed.
    try:
        conf_dir = real_directory(collection)
        projectconf = ProjectConfiguration(conf_dir)
        norm_conf = projectconf.get_normalization_config()
        for entry in norm_conf:
            # TODO THIS IS WRONG (assumed entry layout: name at [0],
            # path at [3], unicode flag at [4] -- confirm)
            dbname, dbpath, dbunicode = entry[0], entry[3], entry[4]
            if dbname == database:
                return (dbpath, dbunicode)
        # not found in config.
        Messager.warning('DB ' + database + ' not defined in config for ' +
                         collection + ', falling back on default.')
        return (None, Simstring.DEFAULT_UNICODE)
    except Exception:
        # whatever goes wrong, just warn and fall back on the default.
        Messager.warning('Failed to get DB path from config for ' +
                         collection + ', falling back on default.')
        return (None, Simstring.DEFAULT_UNICODE)
def _get_db_path(database, collection):
    """Look up the configured path for normalization DB *database*;
    None signals the caller to use its default."""
    if collection is None:
        # TODO: default to WORK_DIR config?
        return None
    try:
        conf_dir = real_directory(collection)
        norm_conf = ProjectConfiguration(conf_dir).get_normalization_config()
        for entry in norm_conf:
            if entry[0] == database:
                return entry[3]
        # not found in config.
        Messager.warning('DB ' + database + ' not defined in config for ' +
                         collection + ', falling back on default.')
        return None
    except Exception:
        # whatever goes wrong, just warn and fall back on the default.
        Messager.warning('Failed to get DB path from config for ' +
                         collection + ', falling back on default.')
        return None
def allowed_to_read(real_path):
    """Apply robots.txt-style access control to *real_path*; allow by
    default when no access-control config exists."""
    data_path = path_join('/', relpath(real_path, DATA_DIR))
    # robots.txt semantics require a trailing slash on directories
    if isdir(real_path):
        data_path += '/'
    real_dir = dirname(real_path)
    robotparser = ProjectConfiguration(real_dir).get_access_control()
    if robotparser is None:
        return True  # default allow
    try:
        user = get_session().get('user')
    except KeyError:
        user = None
    if user is None:
        # anonymous fallback identity
        user = '******'
    # display_message('Path: %s, dir: %s, user: %s, ' % (data_path, real_dir, user), type='error', duration=-1)
    return robotparser.can_fetch(user, data_path)
def allowed_to_read(real_path):
    """Return True when the session user may read *real_path* according
    to the directory's access-control (robots.txt-style) config."""
    data_path = path_join('/', relpath(real_path, DATA_DIR))
    if isdir(real_path):
        # directories must carry a trailing slash to comply with robots.txt
        data_path = '%s/' % (data_path)
    robotparser = ProjectConfiguration(dirname(real_path)).get_access_control()
    if robotparser is None:
        # no access control configured: default allow
        return True
    try:
        user = get_session().get('user')
    except KeyError:
        user = None
    user = user if user is not None else '******'
    # display_message('Path: %s, dir: %s, user: %s, ' % (data_path, real_dir, user), type='error', duration=-1)
    return robotparser.can_fetch(user, data_path)
def reverse_arc(collection, document, origin, target, type, attributes=None):
    """Reverse the direction of an existing binary relation arc
    (Python 3 variant; unquotes the document name)."""
    # undo_resp = {}  # TODO
    real_dir = real_directory(collection)
    # mods = ModificationTracker()  # TODO
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, urllib.parse.unquote(document))
    with TextAnnotations(document) as ann_obj:
        # refuse to modify read-only documents
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse. TODO: more sensible lookup
            found = next(
                (ann for ann in ann_obj.get_relations()
                 if ann.arg1 == origin and ann.arg2 == target
                 and ann.type == type),
                None)
            if found is None:
                Messager.error(
                    'reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' %
                    (str(origin), str(target), str(type)))
            else:
                # found it; swap the endpoints in place
                found.arg1, found.arg2 = found.arg2, found.arg1
        # TODO: modification tracker
        json_response = {}
        json_response['annotations'] = _json_from_ann(ann_obj)
        return json_response
def getAnnObject(collection, document):
    """Load a cached SimpleAnnotations object for *document*, creating and
    validating it (if validation is enabled) before re-caching.

    The cache lives under WORK_DIR/application/ keyed by the flattened
    collection+document name.
    """
    try:
        real_dir = real_directory(collection)
    except Exception:
        # Narrowed from a bare "except:"; any resolution failure means
        # the collection is treated as an already-real path.
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    full_name = (collection + document).replace("/", "")
    cache_file = app_path + full_name
    if os.path.isfile(cache_file):
        # NOTE(review): unpickling cached data is only safe while the
        # cache directory is not writable by untrusted users -- confirm.
        with open(cache_file, 'rb') as temp:
            ann = pickle_load(temp)
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation: run only if the session config enables it and the
    # annotations have not already been validated.
    try:
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        if val:
            if not ann.validated:
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    # refresh the cache (file handles closed via context managers now)
    with open(cache_file, 'wb') as temp:
        pickle_dump(ann, temp)
    return ann
def convert(path, doc):
    """Convert the brat .ann/.txt pair for *doc* under *path* to XML."""
    # path is the collection directory; doc is the file name w/o extension
    projectconf = ProjectConfiguration(path)
    path = path_join(path, doc)
    ann = Annotations(path + ".ann")
    # build the document structure, then layer relations and comments on it
    doc = build_text_structure(ann, path + ".txt")
    add_relations(doc, ann)
    add_comments(doc, ann)
    doc.save(path + ".xml")
def delete_arc(collection, document, origin, target, type):
    """Delete the arc of *type* running from *origin* to *target*."""
    real_dir = real_directory(collection)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # refuse to modify read-only documents
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        _delete_arc_with_ann(origin, target, type, mods, ann_obj, projectconf)
        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def get_span_types(directory):
    """Return (event, entity, attribute, relation) type info for
    *directory* (legacy subtype-based API)."""
    project_conf = ProjectConfiguration(directory)
    # annotation type -> keyboard shortcut
    hotkey_by_type = dict(
        (t, key) for key, t in project_conf.get_kb_shortcuts().iteritems())
    event_types = _get_subtypes_for_type(
        project_conf.get_event_type_hierarchy(),
        project_conf, hotkey_by_type, directory)
    entity_types = _get_subtypes_for_type(
        project_conf.get_entity_type_hierarchy(),
        project_conf, hotkey_by_type, directory)
    attribute_types = _get_attribute_type_info(
        project_conf.get_attribute_type_hierarchy(),
        project_conf, directory)
    relation_types = _get_subtypes_for_type(
        project_conf.get_relation_type_hierarchy(),
        project_conf, hotkey_by_type, directory)
    return event_types, entity_types, attribute_types, relation_types
def get_annotator_config(directory):
    """Return the automatic-annotation service config for *directory*."""
    # TODO: "annotator" is a very confusing term for a web service
    # that does automatic annotation in the context of a tool
    # where most annotators are expected to be human. Rethink.
    project_conf = ProjectConfiguration(directory)
    return project_conf.get_annotator_config()
def _enrich_json_with_data(j_dic, ann_obj):
    """Populate *j_dic* with JSON-serializable annotation data from *ann_obj*.

    Fills the 'events', 'relations', 'triggers'/'entities', 'equivs',
    'attributes', 'normalizations' and 'comments' lists, sets file
    mtime/ctime, reports unparseable lines via Messager, and runs
    annotation verification when validation is enabled for the document's
    directory. Mutates j_dic in place.
    """
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.
    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append([
            unicode(event_ann.id), unicode(event_ann.trigger), event_ann.args
        ])
    for rel_ann in ann_obj.get_relations():
        j_dic['relations'].append([
            unicode(rel_ann.id),
            unicode(rel_ann.type), [(rel_ann.arg1l, rel_ann.arg1),
                                    (rel_ann.arg2l, rel_ann.arg2)]
        ])
    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.spans]
        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        j_dic['entities'] = [
                            j_tb,
                        ]
        else:
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                j_dic['entities'] = [
                    j_tb,
                ]
    for eq_ann in ann_obj.get_equivs():
        # Equivs are serialized with a '*' placeholder id.
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities]))
    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append([
            unicode(att_ann.id),
            unicode(att_ann.type),
            unicode(att_ann.target), att_ann.value
        ])
    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append([
            unicode(norm_ann.id),
            unicode(norm_ann.type),
            unicode(norm_ann.target),
            unicode(norm_ann.refdb),
            unicode(norm_ann.refid),
            unicode(norm_ann.reftext)
        ])
    for com_ann in ann_obj.get_oneline_comments():
        comment = [
            unicode(com_ann.target),
            unicode(com_ann.type),
            com_ann.tail.strip()
        ]
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            j_dic['comments'] = [
                comment,
            ]
    if ann_obj.failed_lines:
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([
                (
                    '%s: %s' % (
                        # The line number is off by one
                        unicode(line_num + 1),
                        unicode(ann_obj[line_num]))).strip()
                for line_num in ann_obj.failed_lines
            ]))
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)
    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime
    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in (
                'all',
                'full',
        ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    # NOTE(review): Python 2-only except syntax; the block appears to end
    # here without storing `issues` into j_dic -- presumably done by code
    # not visible in this chunk, or lost in truncation. TODO confirm.
    except Exception, e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
def get_disambiguator_config(directory):
    """Return the disambiguator service config for *directory*."""
    conf = ProjectConfiguration(directory)
    return conf.get_disambiguator_config()
def get_normalization_config(directory):
    """Return the normalization service config for *directory*."""
    conf = ProjectConfiguration(directory)
    return conf.get_normalization_config()
def delete_arc(collection, document, origin, target, type):
    """Delete an arc (event argument, equiv membership, or relation).

    First tries to treat *origin* as an event and remove the (type,
    target) argument tuple; on AttributeError (origin has no args, i.e.
    not an event) falls back to equiv- or relation-type handling based on
    the project configuration.

    NOTE(review): this variant uses Python 2-only syntax
    (`except ... , e`) and the visible source ends in a dangling
    `else:` -- the block appears truncated; kept byte-identical.
    """
    directory = collection
    real_dir = real_directory(directory)
    document = path_join(real_dir, document)
    # NOTE(review): txt_file_path is computed but never used below.
    txt_file_path = document + '.' + TEXT_FILE_SUFFIX
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        mods = ModificationTracker()
        # This can be an event or an equiv
        #TODO: Check for None!
        try:
            event_ann = ann_obj.get_ann_by_id(origin)
            # Try if it is an event
            arg_tup = (type, unicode(target))
            if arg_tup in event_ann.args:
                before = unicode(event_ann)
                event_ann.args.remove(arg_tup)
                mods.change(before, event_ann)
                '''
                if not event_ann.args:
                    # It was the last argument tuple, remove it all
                    try:
                        ann_obj.del_annotation(event_ann)
                        mods.deletion(event_ann)
                    except DependingAnnotationDeleteError, e:
                        #XXX: Old message api
                        print 'Content-Type: application/json\n'
                        print dumps(e.json_error_response())
                        return
                '''
            else:
                # What we were to remove did not even exist in the first place
                pass
        except AttributeError:
            projectconf = ProjectConfiguration(real_dir)
            if projectconf.is_equiv_type(type):
                # It is an equiv then?
                #XXX: Slow hack! Should have a better accessor! O(eq_ann)
                for eq_ann in ann_obj.get_equivs():
                    # We don't assume that the ids only occur in one Equiv, we
                    # keep on going since the data "could" be corrupted
                    if (unicode(origin) in eq_ann.entities
                            and unicode(target) in eq_ann.entities):
                        before = unicode(eq_ann)
                        eq_ann.entities.remove(unicode(origin))
                        eq_ann.entities.remove(unicode(target))
                        mods.change(before, eq_ann)
                        if len(eq_ann.entities) < 2:
                            # We need to delete this one
                            try:
                                ann_obj.del_annotation(eq_ann)
                                mods.deletion(eq_ann)
                            except DependingAnnotationDeleteError, e:
                                #TODO: This should never happen, dep on equiv
                                #print 'Content-Type: application/json\n'
                                # TODO: Proper exception here!
                                Messager.error(e.json_error_response())
                                return {}
            elif type in projectconf.get_relation_types():
                # Remove the first matching relation annotation.
                for ann in ann_obj.get_relations():
                    if ann.type == type and ann.arg1 == origin and ann.arg2 == target:
                        ann_obj.del_annotation(ann)
                        mods.deletion(ann)
                        break
            else:
def get_search_config(directory):
    """Return the search service config for *directory*."""
    conf = ProjectConfiguration(directory)
    return conf.get_search_config()
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    """Create (or update) an arc between two annotations.

    Dispatches on the configured category of *type*: equiv, relation, or
    event argument. When editing an existing arc whose category differs
    from the new one, the old arc is deleted and a fresh one created.
    Returns the modification-tracker JSON response with 'annotations'
    attached.
    """
    directory = collection
    undo_resp = {}
    real_dir = real_directory(directory)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        #Messager.info('Creating span collection(' + str(collection) + ') document(' + str(document) + ') origin(' + str(origin) + ') target(' + str(target) + ') exists(' + str(ann_obj.ann_exists_in_offset('Arg1:' + str(origin) + '_Arg2:' + str(target))) + ')' )
        # Reject a duplicate arc over the same origin/target pair.
        if ann_obj.ann_exists_in_offset('Arg1:' + str(origin) + '_Arg2:' + str(target)):
            raise SpanOffsetOverlapError([str(ann_obj.get_ann_by_id(origin)).split()[-1],
                                          str(ann_obj.get_ann_by_id(target)).split()[-1]])
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)
        # if there is a previous annotation and the arcs aren't in
        # the same category (e.g. relation vs. event arg), process
        # as delete + create instead of update.
        if old_type is not None and (
                projectconf.is_relation_type(old_type) != projectconf.is_relation_type(type) or
                projectconf.is_equiv_type(old_type) != projectconf.is_equiv_type(type)):
            _delete_arc_with_ann(origin.id, old_target, old_type, mods,
                                 ann_obj, projectconf)
            old_target, old_type = None, None
        # Category dispatch: equiv / relation / event argument.
        if projectconf.is_equiv_type(type):
            ann = _create_equiv(ann_obj, projectconf, mods, origin, target,
                                type, attributes, old_type, old_target)
        elif projectconf.is_relation_type(type):
            ann = _create_relation(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        else:
            ann = _create_argument(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)
        elif comment is not None:
            Messager.warning('create_arc: non-empty comment for None annotation (unsupported type for comment?)')
        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def create_arc(collection, document, origin, target, type,
               old_type=None, old_target=None):
    """Create or update an arc (legacy variant, Python 2 style).

    Dispatches on a literal "Equiv" type string, then on configured
    relation types, and finally falls back to event-argument handling.
    Returns a dict with 'annotations' (and tracked modifications when
    DEBUG is set).

    NOTE(review): this variant duplicates a newer create_arc in this file
    and uses Python 2-only `unicode`; kept byte-identical.
    """
    directory = collection
    real_dir = real_directory(directory)
    mods = ModificationTracker()
    # NOTE(review): real_dir is computed twice; redundant but harmless.
    real_dir = real_directory(directory)
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # Dirty hack to bail as quick as possible if read-only
        # TODO: why only here? The checking of readonly should be
        # consistent across the different editing functions.
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)
        # Ugly check, but we really get no other information
        if type == "Equiv":
            # It is an Equiv
            if old_type == "Equiv":
                # "Change" from Equiv to Equiv is harmless
                # TODO: some message needed?
                pass
            else:
                assert old_type is None, "attempting to change Equiv, not supported"
                ann = EquivAnnotation(type, [unicode(origin.id), unicode(target.id)], "")
                ann_obj.add_annotation(ann)
                mods.addition(ann)
        elif type in projectconf.get_relation_types():
            if old_type is not None or old_target is not None:
                assert type in projectconf.get_relation_types(), (
                    'attempting to convert relation to non-relation "%s" ' % (target.type,)
                ) + ("(legit types: %s)" % (unicode(projectconf.get_relation_types()),))
                sought_target = old_target if old_target is not None else target.id
                sought_type = old_type if old_type is not None else type
                # We are to change the type and/or target
                found = None
                for ann in ann_obj.get_relations():
                    if ann.arg2 == sought_target and ann.type == sought_type:
                        found = ann
                        break
                # Did it exist and is changed?, otherwise we do nothing
                if found is not None and (found.arg2 != target.id or found.type != type):
                    before = unicode(found)
                    found.arg2 = target.id
                    found.type = type
                    mods.change(before, found)
            else:
                # Create a new annotation
                # TODO: Assign a suitable letter
                new_id = ann_obj.get_new_id("R")
                rel = projectconf.get_relation_by_type(type)
                assert rel is not None and len(rel.arg_list) == 2
                a1l, a2l = rel.arg_list
                ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id, a2l, target.id, "\t")
                mods.addition(ann)
                ann_obj.add_annotation(ann)
        else:
            try:
                arg_tup = (type, unicode(target.id))
                # Is this an addition or an update?
                if old_type is None and old_target is None:
                    if arg_tup not in origin.args:
                        before = unicode(origin)
                        origin.args.append(arg_tup)
                        mods.change(before, origin)
                    else:
                        # It already existed as an arg, we were called to do nothing...
                        pass
                else:
                    # Construct how the old arg would have looked like
                    old_arg_tup = (type if old_type is None else old_type,
                                   target if old_target is None else old_target)
                    if old_arg_tup in origin.args and arg_tup not in origin.args:
                        before = unicode(origin)
                        origin.args.remove(old_arg_tup)
                        origin.args.append(arg_tup)
                        mods.change(before, origin)
                    else:
                        # Collision etc. don't do anything
                        pass
            except AttributeError:
                # The annotation did not have args, it was most likely an entity
                # thus we need to create a new Event...
                new_id = ann_obj.get_new_id("E")
                ann = EventAnnotation(origin.id, [arg_tup], new_id, origin.type, "")
                ann_obj.add_annotation(ann)
                mods.addition(ann)
        if DEBUG:
            mods_json = mods.json_response()
        else:
            mods_json = {}
        # Hack since we don't have the actual text, should use a factory?
        txt_file_path = ann_obj.get_document() + "." + TEXT_FILE_SUFFIX
        j_dic = _json_from_ann_and_txt(ann_obj, txt_file_path)
        mods_json["annotations"] = j_dic
        return mods_json
def filter_layers(ann, path):
    """Remove annotations belonging to user-disabled layers from *ann*.

    Added by Sander Naert to disable the visualisation of same annotations.

    Reads the disabled-layer list from the session configuration, then
    strips matching entities, triggers, events, relations and attributes
    from the *ann* dict in place (and returns it). References from kept
    events/relations/attributes to removed entities are dropped too.
    """
    try:
        string = session.load_conf()["config"]
        val = json.loads(string)["layers"]
    except session.NoSessionError:
        val = []
    except KeyError:
        val = []
    except Exception as e:
        val = []
        Messager.error("Error while enabling/disabling layers: " + str(e))
    # NOTE(review): the ProjectConfiguration instance was bound to an
    # unused local; the call is kept in case construction has caching
    # side effects -- TODO confirm and drop if not.
    ProjectConfiguration(path)
    # Sets give O(1) membership tests in the filter loops below
    # (originally plain lists, O(n) per lookup).
    forbidden_ann = set(val)
    forbidden_entities = set()

    # Remove forbidden entities, remembering their ids.
    kept_entities = []
    for entity in ann["entities"]:
        if entity[1] in forbidden_ann:
            forbidden_entities.add(entity[0])
        else:
            kept_entities.append(entity)
    ann["entities"] = kept_entities

    # Remove forbidden triggers, remembering their ids so the events
    # that hang off them can be removed as well.
    forbidden_events = set()
    kept_triggers = []
    for trigger in ann["triggers"]:
        if trigger[1] in forbidden_ann:
            forbidden_events.add(trigger[0])
        else:
            kept_triggers.append(trigger)
    ann["triggers"] = kept_triggers

    # Remove forbidden events; prune role references to removed entities
    # from the events that survive.
    kept_events = []
    for event in ann["events"]:
        if event[1] not in forbidden_events:
            event[2][:] = [role for role in event[2]
                           if role[1] not in forbidden_entities]
            kept_events.append(event)
    ann["events"] = kept_events

    # Remove forbidden relations; also drop any relation with an argument
    # pointing at a removed entity.
    kept_relations = []
    for rel in ann["relations"]:
        if rel[1] not in forbidden_ann:
            if not any(role[1] in forbidden_entities for role in rel[2]):
                kept_relations.append(rel)
    ann["relations"] = kept_relations

    # Remove forbidden attributes and attributes of removed entities.
    ann["attributes"] = [att for att in ann["attributes"]
                         if att[1] not in forbidden_ann
                         and att[2] not in forbidden_entities]
    return ann
def _create_span(collection, document, offsets, _type, attributes=None,
                 normalizations=None, _id=None, comment=None):
    """Create (or, when *_id* is given, edit) a text-bound annotation.

    Parses the attribute and normalization payloads, creates/edits the
    textbound (and possibly an event), then attaches attributes,
    normalizations and comments to the event if one exists, otherwise to
    the textbound. Returns the modification-tracker JSON response with
    'annotations' (and 'undo' data when applicable) attached. Raises
    SpanOffsetOverlapError on overlapping offsets.
    """
    if _offset_overlaps(offsets):
        raise SpanOffsetOverlapError(offsets)
    directory = collection
    undo_resp = {}
    _attributes = _parse_attributes(attributes)
    _normalizations = _parse_span_normalizations(normalizations)
    #log_info('ATTR: %s' %(_attributes, ))
    real_dir = real_directory(directory)
    document = path_join(real_dir, document)
    projectconf = ProjectConfiguration(real_dir)
    txt_file_path = document + '.' + TEXT_FILE_SUFFIX
    # NOTE(review): result of this expression is discarded -- dead
    # statement, likely a leftover; kept byte-identical.
    path_split(document)[0]
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        mods = ModificationTracker()
        if _id is not None:
            # We are to edit an existing annotation
            tb_ann, e_ann = _edit_span(ann_obj, mods, _id, offsets,
                                       projectconf, _attributes, _type,
                                       undo_resp=undo_resp)
        else:
            # We are to create a new annotation
            tb_ann, e_ann = __create_span(
                ann_obj, mods, _type, offsets, txt_file_path, projectconf,
                _attributes)
            undo_resp['action'] = 'add_tb'
            if e_ann is not None:
                undo_resp['id'] = e_ann.id
            else:
                undo_resp['id'] = tb_ann.id
        # Determine which annotation attributes, normalizations,
        # comments etc. should be attached to. If there's an event,
        # attach to that; otherwise attach to the textbound.
        if e_ann is not None:
            # Assign to the event, not the trigger
            target_ann = e_ann
        else:
            target_ann = tb_ann
        # Set attributes
        _set_attributes(ann_obj, target_ann, _attributes, mods,
                        undo_resp=undo_resp)
        # Set normalizations
        _set_normalizations(ann_obj, target_ann, _normalizations, mods,
                            undo_resp=undo_resp)
        # Set comments
        # NOTE(review): the guard tests tb_ann but the comment is set on
        # target_ann (the event when present) -- presumably intentional,
        # since a None tb_ann signals span creation failed; confirm.
        if tb_ann is not None:
            _set_comments(ann_obj, target_ann, comment, mods,
                          undo_resp=undo_resp)
        if tb_ann is not None:
            mods_json = mods.json_response()
        else:
            # Hack, probably we had a new-line in the span
            mods_json = {}
            Messager.error(
                'Text span contained new-line, rejected', duration=3)
        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def tag(collection, document, tagger):
    """Run the configured remote tagger service over *document*.

    Looks up *tagger* in the project's annotator config, POSTs the
    document path and text (joined by the "#*^$#" separator) to the
    service, and converts the JSON response into new textbound and
    normalization annotations. Returns the modification-tracker JSON
    response with 'annotations' attached.

    NOTE(review): the stderr print() calls below look like leftover
    debugging output; kept byte-identical here.
    """
    pconf = ProjectConfiguration(real_directory(collection))
    print("tagger", tagger, file=sys.stderr)
    # for/else: raise only if no configured tagger token matched.
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)
    # NOTE(review): result discarded -- dead statement, kept as-is.
    path_join(real_directory(collection), document)
    # print("path_join(real_directory(collection), document)", path_join(real_directory(collection), document), file=sys.stderr)
    # print("tagger_token", tagger_token, file=sys.stderr)
    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:
        # print("ann_obj", document, file=sys.stderr)
        url_soup = urlparse(tagger_service_url)
        if url_soup.scheme == 'http':
            Connection = HTTPConnection
            # print("HTTPConnection", HTTPConnection, file=sys.stderr)
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python, for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)
        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + ('?'
                                           + url_soup.query if url_soup.query else '')
            try:
                # Note: Trout slapping for anyone sending Unicode objects here
                data = str(path_join(
                    real_directory(collection), document)) + "#*^$#" + ann_obj.get_document_text()
                data = data.encode('utf-8')
                # print("data", type(data),data, file=sys.stderr)
                # print("data", ann_obj, file=sys.stderr)
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
                # httpConnection = http.client.HTTPConnection(url_soup.netloc)
                # httpConnection.request('GET', str(service_url), headers=req_headers)
                # response = httpConnection.getresponse()
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()
            # print("resp-------------", resp.read(), file=sys.stderr)
            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()
        try:
            json_resp = loads(resp_data)
            # print("json_resp", json_resp, file=sys.stderr)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)
        mods = ModificationTracker()
        cidmap = {}
        # print("json_resp.items:::::::::::::", json_resp.items(), file=sys.stderr)
        # First pass: create textbounds, remembering client id -> new id.
        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            # print("json_resp.items:::::::::::::", offsets, file=sys.stderr)
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']
            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(
                offsets
            ), 'Tagger response has different numbers of offsets and texts'
            start, end = offsets[0]
            text = texts[0]
            # print("offsets, _type, texts, text:", offsets, _type, texts, text, file=sys.stderr)
            _id = ann_obj.get_new_id('T')
            print("_id", _id, file=sys.stderr)
            cidmap[cid] = _id
            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))
            mods.addition(tb)
            ann_obj.add_annotation(tb)
        # Second pass: normalizations, re-targeted through cidmap.
        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO
            _id = ann_obj.get_new_id('N')
            target = cidmap[target]
            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')
            mods.addition(na)
            ann_obj.add_annotation(na)
        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
def get_linker_config(directory):
    """Return the linker service config for *directory*."""
    conf = ProjectConfiguration(directory)
    return conf.get_linker_config()
def create_arc(collection, document, origin, target, type,
               old_type=None, old_target=None):
    """Create or update an arc (config-driven variant).

    Like the legacy variant but classifies *type* via the project
    configuration (is_equiv_type / is_relation_type) instead of a literal
    string check, and uses origin.add_argument for event arguments.
    Returns the modification-tracker JSON response with 'annotations'
    attached. NOTE(review): still uses Python 2-only `unicode`.
    """
    directory = collection
    real_dir = real_directory(directory)
    mods = ModificationTracker()
    # NOTE(review): real_dir is computed twice; redundant but harmless.
    real_dir = real_directory(directory)
    projectconf = ProjectConfiguration(real_dir)
    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())
        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)
        if projectconf.is_equiv_type(type):
            # It is an Equiv
            if projectconf.is_equiv_type(old_type):
                # "Change" from Equiv to Equiv is harmless
                # TODO: some message needed?
                pass
            else:
                assert old_type is None, 'attempting to change equiv relation to non-equiv relation, operation not supported'
                ann = EquivAnnotation(type, [unicode(origin.id),
                                             unicode(target.id)], '')
                ann_obj.add_annotation(ann)
                mods.addition(ann)
        elif projectconf.is_relation_type(type):
            if old_type is not None or old_target is not None:
                assert type in projectconf.get_relation_types(), (
                    ('attempting to convert relation to non-relation "%s" ' % (target.type, )) +
                    ('(legit types: %s)' % (unicode(projectconf.get_relation_types()), )))
                sought_target = (old_target
                                 if old_target is not None else target.id)
                sought_type = (old_type if old_type is not None else type)
                # We are to change the type and/or target
                found = None
                for ann in ann_obj.get_relations():
                    if ann.arg2 == sought_target and ann.type == sought_type:
                        found = ann
                        break
                # Did it exist and is changed?, otherwise we do nothing
                if found is not None and (found.arg2 != target.id
                                          or found.type != type):
                    before = unicode(found)
                    found.arg2 = target.id
                    found.type = type
                    mods.change(before, found)
            else:
                # Create a new annotation
                # TODO: Assign a suitable letter
                new_id = ann_obj.get_new_id('R')
                rel = projectconf.get_relation_by_type(type)
                assert rel is not None and len(rel.arg_list) == 2
                a1l, a2l = rel.arg_list
                ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id,
                                               a2l, target.id, '\t')
                mods.addition(ann)
                ann_obj.add_annotation(ann)
        else:
            try:
                arg_tup = (type, unicode(target.id))
                # Is this an addition or an update?
                if old_type is None and old_target is None:
                    if arg_tup not in origin.args:
                        before = unicode(origin)
                        origin.add_argument(type, unicode(target.id))
                        mods.change(before, origin)
                    else:
                        # It already existed as an arg, we were called to do nothing...
                        pass
                else:
                    # Construct how the old arg would have looked like
                    old_arg_tup = (type if old_type is None else old_type,
                                   target if old_target is None else old_target)
                    if old_arg_tup in origin.args and arg_tup not in origin.args:
                        before = unicode(origin)
                        origin.args.remove(old_arg_tup)
                        origin.add_argument(type, unicode(target.id))
                        mods.change(before, origin)
                    else:
                        # Collision etc. don't do anything
                        pass
            except AttributeError:
                # The annotation did not have args, it was most likely an entity
                # thus we need to create a new Event...
                new_id = ann_obj.get_new_id('E')
                ann = EventAnnotation(
                    origin.id, [arg_tup], new_id, origin.type, ''
                )
                ann_obj.add_annotation(ann)
                mods.addition(ann)
        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def tag(collection, document, tagger):
    """Run the configured remote tagger service over *document*.

    Looks up *tagger* in the project's annotator config, POSTs the
    document text to the service, and converts the JSON response into
    new textbound and normalization annotations. Returns the
    modification-tracker JSON response with 'annotations' attached.
    Raises UnknownTaggerError, InvalidConnectionSchemeError,
    TaggerConnectionError or InvalidTaggerResponseError on failure.
    """
    pconf = ProjectConfiguration(real_directory(collection))
    # for/else: raise only if no configured tagger token matched.
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)
    # NOTE(review): result discarded -- dead statement, kept as-is.
    path_join(real_directory(collection), document)
    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:
        url_soup = urlparse(tagger_service_url)
        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python, for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)
        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request('POST',
                             # As per: http://bugs.python.org/issue11898
                             # Force the url to be an ascii string
                             str(service_url),
                             data,
                             headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()
            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()
        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)
        mods = ModificationTracker()
        cidmap = {}
        # First pass: create textbounds, remembering client id -> new id.
        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']
            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(
                offsets), 'Tagger response has different numbers of offsets and texts'
            start, end = offsets[0]
            text = texts[0]
            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id
            tb = TextBoundAnnotationWithText(
                offsets, _id, _type, text, " " + ' '.join(texts[1:]))
            mods.addition(tb)
            ann_obj.add_annotation(tb)
        # Second pass: normalizations, re-targeted through cidmap.
        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO
            _id = ann_obj.get_new_id('N')
            target = cidmap[target]
            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')
            mods.addition(na)
            ann_obj.add_annotation(na)
        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
# NOTE(review): orphan fragment -- this is the per-document statistics
# loop of get_statistics (a full copy appears later in this file), pasted
# here without its enclosing function. It references base_names,
# directory, docstats and stat_types that are not defined at this level,
# uses Python 2-only `except ... , e` syntax, and is truncated at a
# dangling `try:`. Kept byte-identical; candidate for deletion.
for docname in base_names:
    try:
        with Annotations(path_join(directory, docname),
                         read_only=True) as ann_obj:
            tb_count = len([a for a in ann_obj.get_entities()])
            rel_count = (len([a for a in ann_obj.get_relations()]) +
                         len([a for a in ann_obj.get_equivs()]))
            event_count = len([a for a in ann_obj.get_events()])
            if options_get_validation(directory) == 'none':
                docstats.append([tb_count, rel_count, event_count])
            else:
                # verify and include verification issue count
                try:
                    from projectconfig import ProjectConfiguration
                    projectconf = ProjectConfiguration(directory)
                    from verify_annotations import verify_annotation
                    issues = verify_annotation(ann_obj, projectconf)
                    issue_count = len(issues)
                except:
                    # TODO: error reporting
                    issue_count = -1
                docstats.append(
                    [tb_count, rel_count, event_count, issue_count])
    except Exception, e:
        log_info('Received "%s" when trying to generate stats' % e)
        # Pass exceptions silently, just marking stats missing
        docstats.append([-1] * len(stat_types))
# Cache the statistics
try:
def get_statistics(directory, base_names, use_cache=True):
    """Return per-document annotation statistics for *base_names*.

    Returns (stat_types, docstats) where stat_types is a list of
    (column_name, type) headers and docstats one row per document:
    [entities, relations, events] plus an issue count when validation is
    enabled. Results are pickled to a per-directory cache file and reused
    unless the config or any file in the directory is newer than the
    cache (or use_cache is False). Failed documents get a row of -1s.
    """
    # Check if we have a cache of the costly satistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)
    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        # errno 2 == ENOENT: no cache yet; -1 forces regeneration below.
        if e.errno == 2:
            cache_mtime = -1
        else:
            raise
    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating' %
                        cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError as e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True
    if not use_cache:
        generate = True
    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"),
                  ("Events", "int")]
    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))
    if generate:
        # Generate the document statistics from scratch
        from .annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])
                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except BaseException:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))
        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s" %
                (directory, e))
    return stat_types, docstats