def __init__(self, directory):
    # debugging (note: latter test for windows paths)
    if directory[:1] != "/" and not re.search(r'^[a-zA-Z]:\\', directory):
        Messager.debug(
            "Project config received relative directory ('%s'), "
            "configuration may not be found." % directory, duration=-1)
    self.directory = directory

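# A minimal standalone sketch of the path test above, using only the
# standard-library `re` module; the sample paths are hypothetical.
import re

def _looks_absolute(directory):
    """True for POSIX ("/...") and Windows drive ("C:\\...") paths."""
    return (directory[:1] == "/"
            or re.search(r'^[a-zA-Z]:\\', directory) is not None)

assert _looks_absolute("/srv/brat/data")
assert _looks_absolute("C:\\brat\\data")
assert not _looks_absolute("relative/config/dir")
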
def attributes_for(self, ann_type):
    """
    Returns a list of the possible attribute types for an
    annotation of the given type.
    """
    attrs = []
    for attr in com.get_attribute_type_list(self.directory):
        if attr == cst.SEPARATOR_STR:
            continue
        if 'Arg' not in attr.arguments:
            Messager.warning(
                "Project configuration: config error: attribute '%s' "
                "lacks 'Arg:' specification." % attr.storage_form())
            continue
        types = attr.arguments['Arg']
        if ((ann_type in types) or ('<ANY>' in types) or
                (self.is_event_type(ann_type) and '<EVENT>' in types) or
                (self.is_physical_entity_type(ann_type) and
                 '<ENTITY>' in types) or
                (self.is_relation_type(ann_type) and
                 '<RELATION>' in types)):
            attrs.append(attr.storage_form())
    return attrs

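# Standalone illustration of the wildcard test used above, with the
# type-predicate results passed in as plain booleans; all names here are
# hypothetical, not part of the configuration module.
def _attr_applies(ann_type, arg_types, is_event, is_entity, is_relation):
    return (ann_type in arg_types
            or '<ANY>' in arg_types
            or (is_event and '<EVENT>' in arg_types)
            or (is_entity and '<ENTITY>' in arg_types)
            or (is_relation and '<RELATION>' in arg_types))

# an attribute whose Arg: is <ENTITY> applies to entity types only
assert _attr_applies('Protein', ['<ENTITY>'], False, True, False)
assert not _attr_applies('Binding', ['<ENTITY>'], True, False, False)
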
def wrapper(*args, **kwds):
    """
    Warn (in DEBUG mode) that a deprecated action was invoked,
    then delegate to func.
    """
    if DEBUG:
        Messager.warning(
            ('Client sent "%s" action '
             'which is marked as deprecated') % func.__name__,
        )
    return func(*args, **kwds)

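# Hedged sketch of the enclosing decorator this wrapper is presumably
# returned from; `deprecated` and `old_action` are illustrative names,
# and plain print stands in for Messager.warning to keep the sketch
# self-contained.
from functools import wraps

DEBUG = True

def deprecated(func):
    @wraps(func)
    def wrapper(*args, **kwds):
        if DEBUG:
            print('Client sent "%s" action which is marked as deprecated'
                  % func.__name__)
        return func(*args, **kwds)
    return wrapper

@deprecated
def old_action():
    return 'ok'

assert old_action() == 'ok'  # also prints the deprecation warning
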
def test_01_warning(self):
    """
    test warning level
    """
    Messager.warning(u'Hello 世界!')
    json_dic = {}
    Messager.output_json(json_dic)
    self.assertEqual(
        json_dic,
        {'messages': [(u'Hello \u4e16\u754c\uff01', 'warning', 3)]})

def test_02_info(self):
    """
    test info level
    """
    Messager.info(u'Hello 世界!')
    json_dic = {}
    Messager.output_json(json_dic)
    self.assertEqual(
        json_dic,
        {'messages': [(u'Hello \u4e16\u754c\uff01', 'comment', 3)]})

def test_03_error(self):
    """
    test error level
    """
    Messager.error(u'Hello 世界!')
    json_dic = {}
    Messager.output_json(json_dic)
    self.assertEqual(
        json_dic,
        {'messages': [(u'Hello \u4e16\u754c\uff01', 'error', 3)]})

def get_labels(directory):
    labels = {}
    for t in get_visual_configs(directory)[0][cst.LABEL_SECTION]:
        if t.storage_form() in labels:
            Messager.warning(
                "In configuration, labels for '%s' defined more "
                "than once. Only using the last set." % t.storage_form(),
                -1)
        # first is the storage form, the rest are labels.
        labels[t.storage_form()] = t.terms[1:]
    return labels

def test_04_debug(self):
    """
    test debug level
    """
    Messager.debug(u'Hello 世界!')
    json_dic = {}
    Messager.output_json(json_dic)
    self.assertEqual(
        json_dic,
        {'messages': [(u'Hello \u4e16\u754c\uff01', 'debug', 3)]})

def get_configs(directory, filename, defaultstr, minconf, sections,
                optional_sections):
    if (directory, filename) not in _GET_CONFIGS_CACHE:
        configstr, source = __read_first_in_directory_tree(
            directory, filename)

        if configstr is None:
            # didn't get one; try default dir and fall back to the default
            configstr = __read_or_default(filename, defaultstr)
            if configstr == defaultstr:
                Messager.info(
                    "Project configuration: no configuration file (%s) "
                    "found, using default." % filename, 5)
                source = "[default]"
            else:
                source = filename

        # try to parse what was found, fall back to minimal config
        try:
            configs, section_labels = __parse_configs(
                configstr, source, sections, optional_sections)
        except InvalidProjectConfigException:
            Messager.warning(
                "Project configuration: Falling back to minimal default. "
                "Configuration is likely wrong.", 5)
            configs = minconf
            section_labels = dict([(a, a) for a in sections])

        # very, very special case processing: if we have a type
        # "Equiv" defined in a "relations" section that doesn't
        # specify a "<REL-TYPE>", automatically fill "symmetric" and
        # "transitive". This is to support older configurations that
        # rely on the type "Equiv" to identify the relation as an
        # equivalence.
        if 'relations' in configs:
            for r in configs['relations']:
                if r == cst.SEPARATOR_STR:
                    continue
                if (r.storage_form() == "Equiv" and
                        "<REL-TYPE>" not in r.special_arguments):
                    # this was way too much noise; will only add in after
                    # at least most configs are revised.
                    # Messager.warning('Note: "Equiv" defined in config '
                    #                  'without "<REL-TYPE>"; assuming '
                    #                  'symmetric and transitive. Consider '
                    #                  'revising config to add '
                    #                  '"<REL-TYPE>:symmetric-transitive" '
                    #                  'to definition.')
                    r.special_arguments["<REL-TYPE>"] = [
                        "symmetric", "transitive"]

        _GET_CONFIGS_CACHE[(directory, filename)] = (configs,
                                                     section_labels)

    return _GET_CONFIGS_CACHE[(directory, filename)]

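# Hedged sketch of the legacy "Equiv" rewrite above, with a plain dict
# standing in for the config node objects (which expose storage_form()
# and special_arguments in the real code).
relation = {'storage_form': 'Equiv', 'special_arguments': {}}

if (relation['storage_form'] == 'Equiv'
        and '<REL-TYPE>' not in relation['special_arguments']):
    relation['special_arguments']['<REL-TYPE>'] = ['symmetric',
                                                   'transitive']

assert relation['special_arguments']['<REL-TYPE>'] == ['symmetric',
                                                       'transitive']
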
def get_search_config(self):
    search_config = []
    for r in com.get_search_config_list(self.directory):
        if '<URL>' not in r.special_arguments:
            Messager.warning(
                'Project configuration: config error: missing <URL> '
                'specification for %s search.' % r.storage_form())
        else:
            search_config.append(
                (r.storage_form(), r.special_arguments['<URL>'][0]))
    return search_config

def _listdir(directory, user):
    # return listdir(directory)
    try:
        assert_allowed_to_read(directory, user)
        return [f for f in listdir(directory)
                if not _is_hidden(f)
                and allowed_to_read(path_join(directory, f), user)]
    except OSError as exception:
        Messager.error("Error listing %s: %s" % (directory, exception))
        raise AnnotationCollectionNotFoundError(directory)

def multiple_allowed_arguments(self, atype):
    """
    Returns the argument types that are allowed to be filled more
    than once for an annotation of the given type.
    """
    node = com.get_node_by_storage_form(self.directory, atype)
    if node is None:
        Messager.warning(
            "Project configuration: unknown event type %s. "
            "Configuration may be wrong." % atype)
        return []
    return node.multiple_allowed_arguments()

def mandatory_arguments(self, atype):
    """
    Returns the mandatory argument types that must be present for
    an annotation of the given type.
    """
    node = com.get_node_by_storage_form(self.directory, atype)
    if node is None:
        Messager.warning(
            "Project configuration: unknown event type %s. "
            "Configuration may be wrong." % atype)
        return []
    return node.mandatory_arguments()

def argument_minimum_count(self, atype, arg):
    """
    Returns the minimum number of times that the given argument is
    allowed to be filled for an annotation of the given type.
    """
    node = com.get_node_by_storage_form(self.directory, atype)
    if node is None:
        Messager.warning(
            "Project configuration: unknown event type %s. "
            "Configuration may be wrong." % atype)
        return 0
    return node.argument_minimum_count(arg)

def _store_cache_stat(docstats, cache_file_path, directory):
    """
    Cache the statistics
    """
    try:
        with open(cache_file_path, 'wb') as cache_file:
            pickle_dump(docstats, cache_file,
                        protocol=constants.PICKLE_PROTOCOL)
    except IOError as exception:
        Messager.warning(
            "Could not write statistics cache file to directory %s: %s" %
            (directory, exception))

def get_node_by_storage_form(directory, term):
    if directory not in _GET_NODE_BY_STORAGE_FORM_CACHE:
        d = {}
        for e in (get_entity_type_list(directory) +
                  get_event_type_list(directory)):
            t = e.storage_form()
            if t in d:
                Messager.warning(
                    "Project configuration: term %s appears multiple "
                    "times, only using last. Configuration may be "
                    "wrong." % t, 5)
            d[t] = e
        _GET_NODE_BY_STORAGE_FORM_CACHE[directory] = d

    return _GET_NODE_BY_STORAGE_FORM_CACHE[directory].get(term, None)

def __parse_kb_shortcuts(shortcutstr, default, source):
    shortcuts = {}
    for l in shortcutstr.split("\n"):
        l = l.strip()
        if l == "" or l[:1] == "#":
            continue
        key, type_ = re.split(r'[ \t]+', l)
        if key in shortcuts:
            Messager.warning("Project configuration: keyboard shortcut "
                             "for '%s' defined multiple times. Ignoring "
                             "all but first ('%s')" %
                             (key, shortcuts[key]))
        else:
            shortcuts[key] = type_
    return shortcuts

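# Standalone sketch of the shortcut format parsed above: one
# "KEY<whitespace>Type" pair per line, blanks and '#' comments skipped,
# duplicate keys keeping the first binding. The sample content is
# hypothetical.
import re

_example = "# comment line\nP\tPerson\nO Organization\nP\tProtein\n"
_shortcuts = {}
for _l in _example.split("\n"):
    _l = _l.strip()
    if _l == "" or _l[:1] == "#":
        continue
    _key, _type = re.split(r'[ \t]+', _l)
    if _key not in _shortcuts:
        _shortcuts[_key] = _type

assert _shortcuts == {'P': 'Person', 'O': 'Organization'}
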
def __init__(self, terms, args=None):
    if args is None:
        args = []
    self.terms, self.args = terms, args

    if not terms or len([t for t in terms if t == ""]) != 0:
        Messager.debug("Empty term in configuration", duration=-1)
        raise InvalidProjectConfigException

    # unused if any of the terms marked with "!"
    self.unused = False
    for i in range(len(self.terms)):
        if self.terms[i][0] == "!":
            self.terms[i] = self.terms[i][1:]
            self.unused = True
    self.children = []

    # The first of the listed terms is used as the primary term for
    # storage (excepting for "special" config-only types). Due to
    # format restrictions, this form must not have e.g. space or
    # various special characters.
    if self.terms[0] not in cst.SPECIAL_RELATION_TYPES:
        self.__primary_term = normalize_to_storage_form(self.terms[0])
    else:
        self.__primary_term = self.terms[0]
    # TODO: this might not be the ideal place to put this warning
    if self.__primary_term != self.terms[0]:
        Messager.warning(
            "Note: in configuration, term '%s' is not "
            "appropriate for storage (should match "
            "'^[a-zA-Z0-9_-]*$'), using '%s' instead. "
            "(Revise configuration file to get rid of "
            "this message. Terms other than the first are "
            "not subject to this restriction.)" %
            (self.terms[0], self.__primary_term), -1)
        self.terms[0] = self.__primary_term

    # TODO: cleaner and more localized parsing
    self.arguments = {}
    self.special_arguments = {}
    self.arg_list = []
    self.arg_min_count = {}
    self.arg_max_count = {}
    self.keys_by_type = {}
    for a in self.args:
        self._process_arg(a, args)

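# Hedged illustration only: this is NOT the project's
# normalize_to_storage_form, just a guess at its contract based on the
# '^[a-zA-Z0-9_-]*$' restriction documented in the warning above.
import re

def _storage_form_sketch(term):
    """Replace characters disallowed in storage forms with underscores."""
    return re.sub(r'[^a-zA-Z0-9_-]', '_', term)

assert _storage_form_sketch('cell line') == 'cell_line'
assert _storage_form_sketch('Protein') == 'Protein'
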
def sentences(self):
    """
    :returns: list of sentence tuple offsets
    """
    ssplitter = self.configuration["sentence-splitter"]

    if ssplitter == 'newline':
        from arat.server.ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from arat.server.ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option '
                         ', reverting to newline sentence splitting.')
        from arat.server.ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen

    return [o for o in ss_offset_gen(self.text)]

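# Hedged refactoring sketch: the same dispatch appears again in
# _enrich_json_with_text below. A shared helper along these lines (the
# name ssplitter_by_name is hypothetical) would remove the duplication;
# the warning emission is left out for brevity.
def ssplitter_by_name(name):
    from arat.server.ssplit import (newline_sentence_boundary_gen,
                                    regex_sentence_boundary_gen)
    if name == 'regex':
        return regex_sentence_boundary_gen
    # 'newline' and any unrecognized option fall back to newline splitting
    return newline_sentence_boundary_gen
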
def store_svg(collection, document, svg):
    stored = []

    _save_svg(collection, document, svg)
    stored.append({'name': 'svg', 'suffix': SVG_SUFFIX})

    # attempt conversions from SVG to other formats
    try:
        from config import SVG_CONVERSION_COMMANDS
    except ImportError:
        SVG_CONVERSION_COMMANDS = []

    for format_, command in SVG_CONVERSION_COMMANDS:
        try:
            from os import system

            svgfn = _svg_path()
            # TODO: assuming format name matches suffix; generalize
            outfn = svgfn.replace('.' + SVG_SUFFIX, '.' + format_)
            cmd = command % (svgfn, outfn)

            import logging
            logging.error(cmd)

            system(cmd)

            # TODO: this check may not work on all architectures.
            # consider rather checking whether the intended output file
            # exists (don't forget to delete a possible old one
            # with the same name, though).
            # if retval != 0:
            #     stored.append({'name': format_, 'suffix': format_})
            # else:
            #     Messager.warning("Failed conversion to %s" % format_)

            # I'm getting weird return values from inkscape; will
            # just assume everything's OK ...
            # TODO: check return value, react appropriately
            stored.append({'name': format_, 'suffix': format_})
        except:  # pylint: disable=W0702
            Messager.warning("Failed conversion to %s" % format_)

    return {'stored': stored}

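# Hedged sketch of how one conversion entry expands; the pair
# ('png', 'convert %s %s') is a hypothetical SVG_CONVERSION_COMMANDS
# value (e.g. ImageMagick), not something shipped with the project.
_format, _command = 'png', 'convert %s %s'
_svgfn = '/tmp/doc.svg'
_outfn = _svgfn.replace('.svg', '.' + _format)
assert _command % (_svgfn, _outfn) == 'convert /tmp/doc.svg /tmp/doc.png'
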
def get_normalization_config(self):
    norm_list = com.get_normalization_config_list(self.directory)

    norm_config = []
    for n in norm_list:
        if 'DB' not in n.arguments:
            # optional, server looks in default location if None
            n.arguments['DB'] = [None]
        if '<URL>' not in n.special_arguments:
            Messager.warning(
                'Project configuration: config error: missing <URL> '
                'specification for %s.' % n.storage_form())
            continue
        if '<URLBASE>' not in n.special_arguments:
            # now optional, client skips link generation if None
            n.special_arguments['<URLBASE>'] = [None]
        norm_config.append((n.storage_form(),
                            n.special_arguments['<URL>'][0],
                            n.special_arguments['<URLBASE>'][0],
                            n.arguments['DB'][0]))
    return norm_config

def __directory_relations_by_arg_num(directory, num, atype,
                                     include_special=False):
    assert 0 <= num < 2, "INTERNAL ERROR"

    rels = []

    entity_types = set(
        [t.storage_form() for t in get_entity_type_list(directory)])
    event_types = set(
        [t.storage_form() for t in get_event_type_list(directory)])

    for r in get_relation_type_list(directory):
        # "Special" nesting relations ignored unless specifically
        # requested
        if (r.storage_form() in cst.SPECIAL_RELATION_TYPES and
                not include_special):
            continue

        if len(r.arg_list) != 2:
            # Don't complain about argument constraints for unused
            # relations
            if not r.unused:
                Messager.warning(
                    "Relation type %s has %d arguments in "
                    "configuration (%s; expected 2). Please fix "
                    "configuration." % (r.storage_form(),
                                        len(r.arg_list),
                                        ",".join(r.arg_list)))
        else:
            types = r.arguments[r.arg_list[num]]
            for type_ in types:
                # TODO: there has to be a better way
                if (type_ == atype or
                        type_ == "<ANY>" or
                        atype == "<ANY>" or
                        (type_ in entity_types and atype == "<ENTITY>") or
                        (type_ in event_types and atype == "<EVENT>") or
                        (atype in entity_types and type_ == "<ENTITY>") or
                        (atype in event_types and type_ == "<EVENT>")):
                    rels.append(r)
                    # TODO: why not break here?
    return rels

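# Standalone sketch of the bidirectional wildcard test above, with the
# entity/event type sets passed in explicitly; all names are hypothetical.
def _arg_matches(type_, atype, entity_types, event_types):
    return (type_ == atype
            or type_ == "<ANY>" or atype == "<ANY>"
            or (type_ in entity_types and atype == "<ENTITY>")
            or (type_ in event_types and atype == "<EVENT>")
            or (atype in entity_types and type_ == "<ENTITY>")
            or (atype in event_types and type_ == "<EVENT>"))

assert _arg_matches('Protein', '<ENTITY>', {'Protein'}, set())
assert not _arg_matches('Binding', '<ENTITY>', {'Protein'}, {'Binding'})
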
def _get_option_by_storage_form(directory, term, config, cache):
    if directory not in cache:
        d = {}
        for n in config:
            t = n.storage_form()
            if t in d:
                Messager.warning(
                    "Project configuration: %s appears multiple times, "
                    "only using last. Configuration may be wrong." % t, 5)
            d[t] = {}
            for a in n.arguments:
                if len(n.arguments[a]) != 1:
                    Messager.warning(
                        "Project configuration: %s key %s has multiple "
                        "values, only using first. Configuration may "
                        "be wrong." % (t, a), 5)
                d[t][a] = n.arguments[a][0]
        cache[directory] = d

    return cache[directory].get(term, None)

def get_statistics(directory, base_names, use_cache=True):
    """
    Check if we have a cache of the costly statistics generation.
    Also, only use it if no file is newer than the cache itself.
    """
    cache_file_path = _get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as exception:
        if exception.errno == 2:  # ENOENT: no cache written yet
            cache_mtime = -1
        else:
            raise

    try:
        if _need_regeneration(directory, cache_file_path, cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating' %
                        cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError:
        Messager.warning(
            'Failed checking file modification times for stats cache '
            'check; regenerating')
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"),
                  ("Events", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        stat_types, docstats = _generate_stats(directory, base_names,
                                               stat_types,
                                               cache_file_path)

    return stat_types, docstats

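# Standalone sketch of the staleness rule assumed behind
# _need_regeneration: regenerate when any source document is newer than
# the cache. The helper name and exact semantics are assumptions, not
# the project's implementation.
from os.path import getmtime

def _cache_is_stale(cache_mtime, source_paths):
    return any(getmtime(p) > cache_mtime for p in source_paths)
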
def test_tokeniser_by_name(self):
    """
    tokeniser_by_name
    """
    self.assertEqual(tokenise.tokeniser_by_name('whitespace'),
                     tokenise.whitespace_token_boundary_gen)
    self.assertEqual(tokenise.tokeniser_by_name('ptblike'),
                     tokenise.gtb_token_boundary_gen)

    # check that no message has been published
    self.assertEqual(Messager.output_json({}), {'messages': []})

    # Any other name returns the default whitespace tokeniser
    # and leaves a message
    self.assertEqual(tokenise.tokeniser_by_name('invalid!'),
                     tokenise.whitespace_token_boundary_gen)
    self.assertEqual(
        Messager.output_json({}),
        {'messages': [('Unrecognized tokenisation option , '
                       'reverting to whitespace '
                       'tokenisation.', 'warning', 3)]})

def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?',
                -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    tok_offset_gen = tokeniser_by_name(tokeniser)
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))

    if ssplitter == 'newline':
        from arat.server.ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from arat.server.ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option '
                         ', reverting to newline sentence splitting.')
        from arat.server.ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen

    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True

def tokeniser_by_name(name):
    """
    load a tokeniser by name

    Available tokenisers:

    >>> tokeniser_by_name('whitespace') == whitespace_token_boundary_gen
    True

    >>> tokeniser_by_name('ptblike') == gtb_token_boundary_gen
    True

    Any other name returns the default whitespace tokeniser:

    >>> tokeniser_by_name('unknown') == whitespace_token_boundary_gen
    True
    """
    if name in REGISTERED_TOKENISER:
        return REGISTERED_TOKENISER[name]

    Messager.warning('Unrecognized tokenisation option '
                     ', reverting to whitespace tokenisation.')
    return whitespace_token_boundary_gen

def arc_types_from_to(self, from_ann, to_ann="<ANY>",
                      include_special=False):
    """
    Returns the possible arc types that can connect an annotation
    of type from_ann to an annotation of type to_ann.
    If to_ann has the value "<ANY>", returns all possible arc types.
    """
    from_node = com.get_node_by_storage_form(self.directory, from_ann)

    if from_node is None:
        Messager.warning(
            "Project configuration: unknown textbound/event type %s. "
            "Configuration may be wrong." % from_ann)
        return []

    if to_ann == "<ANY>":
        relations_from = com.get_relations_by_arg1(
            self.directory, from_ann, include_special)
        # TODO: consider using from_node.arg_list instead of
        # .arguments for order
        return com.unique_preserve_order(
            [role for role in from_node.arguments] +
            [r.storage_form() for r in relations_from])

    # specific hits
    types = from_node.keys_by_type.get(to_ann, [])

    if "<ANY>" in from_node.keys_by_type:
        types += from_node.keys_by_type["<ANY>"]

    # generic arguments
    if self.is_event_type(to_ann) and '<EVENT>' in from_node.keys_by_type:
        types += from_node.keys_by_type['<EVENT>']
    if (self.is_physical_entity_type(to_ann) and
            '<ENTITY>' in from_node.keys_by_type):
        types += from_node.keys_by_type['<ENTITY>']

    # relations
    types.extend(self.relation_types_from_to(from_ann, to_ann))

    return com.unique_preserve_order(types)

def retrieve_stored(document, suffix):
    stored_path = _stored_path() + '.' + suffix

    if not isfile(stored_path):
        # @ninjin: not sure what 'version' was supposed to be returned
        #          here, but none was defined, so returning that
        # raise NoSVGError(version)
        raise NoSVGError('None')

    filename = document + '.' + suffix

    # sorry, quick hack to get the content-type right
    # TODO: send this with initial 'stored' response instead of
    # guessing on suffix
    if suffix == SVG_SUFFIX:
        content_type = 'image/svg+xml'
    elif suffix == PNG_SUFFIX:
        content_type = 'image/png'
    elif suffix == PDF_SUFFIX:
        content_type = 'application/pdf'
    elif suffix == EPS_SUFFIX:
        content_type = 'application/postscript'
    else:
        Messager.error('Unknown suffix "%s"; cannot determine '
                       'Content-Type' % suffix)
        # TODO: reasonable backoff value
        content_type = None

    # Bail out with a hack since we violated the protocol
    hdrs = [('Content-Type', content_type),
            ('Content-Disposition', 'inline; filename=' + filename)]

    with open(stored_path, 'rb') as stored_file:
        data = stored_file.read()

    return (hdrs, data)

def overlap_types(self, inner, outer):
    """
    Returns the set of annotation overlap types that have been
    configured for the given pair of annotations.
    """
    # TODO: this is O(NM) for relation counts N and M and goes
    # past much of the implemented caching. Might become a
    # bottleneck for annotations with large type systems.
    t1r = com.get_relations_by_arg1(self.directory, inner, True)
    t2r = com.get_relations_by_arg2(self.directory, outer, True)

    types = []
    for r in (s for s in t1r
              if s.storage_form() in cst.SPECIAL_RELATION_TYPES):
        if r in t2r:
            types.append(r)

    # new-style overlap configuration ("<OVERLAP>") takes precedence
    # over old-style configuration ("ENTITY-NESTING").
    ovl_types = set()

    ovl = [r for r in types
           if r.storage_form() == cst.TEXTBOUND_OVERLAP_TYPE]
    nst = [r for r in types
           if r.storage_form() == cst.ENTITY_NESTING_TYPE]

    if ovl:
        if nst:
            Messager.warning('Warning: both ' +
                             cst.TEXTBOUND_OVERLAP_TYPE +
                             ' and ' + cst.ENTITY_NESTING_TYPE +
                             ' defined for (' + inner + ',' + outer +
                             ') in config. Ignoring latter.')
        for r in ovl:
            if cst.OVERLAP_TYPE_ARG not in r.special_arguments:
                Messager.warning('Warning: missing ' +
                                 cst.OVERLAP_TYPE_ARG + ' for ' +
                                 cst.TEXTBOUND_OVERLAP_TYPE +
                                 ', ignoring specification.')
                continue
            for val in r.special_arguments[cst.OVERLAP_TYPE_ARG]:
                ovl_types |= set(val.split('|'))
    elif nst:
        # translate into new-style configuration
        ovl_types = set(['contain'])
    else:
        ovl_types = set()

    undefined_types = [t for t in ovl_types
                       if t not in ('contain', 'equal', 'cross', '<ANY>')]
    if undefined_types:
        Messager.warning('Undefined ' + cst.OVERLAP_TYPE_ARG +
                         ' value(s) ' + str(undefined_types) + ' for (' +
                         inner + ',' + outer + ') in config.')
    return ovl_types

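# Standalone sketch of the <OVERLAP-TYPE> value parsing above;
# 'contain|equal' is a hypothetical configured value.
_ovl_types = set()
for _val in ['contain|equal']:
    _ovl_types |= set(_val.split('|'))
assert _ovl_types == {'contain', 'equal'}
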