Esempio n. 1
0
    def attributes_for(self, ann_type):
        """
        Returns the storage forms of the attribute types that can be
        attached to an annotation of the given type.

        An attribute applies when its 'Arg' specification lists the type
        itself, '<ANY>', or the generic marker ('<EVENT>', '<ENTITY>' or
        '<RELATION>') matching the kind of the type. Attributes without an
        'Arg' specification are reported with a warning and skipped.
        """
        applicable = []
        for attribute in com.get_attribute_type_list(self.directory):
            # separators are not attribute definitions
            if attribute == cst.SEPARATOR_STR:
                continue

            if 'Arg' not in attribute.arguments:
                Messager.warning(
                    "Project configuration: config error: attribute '%s' "
                    "lacks 'Arg:' specification." % attribute.storage_form())
                continue

            targets = attribute.arguments['Arg']

            # plain membership tests first; the kind lookups only run when
            # the cheaper tests did not already decide the outcome
            if ann_type in targets or '<ANY>' in targets:
                matches = True
            elif self.is_event_type(ann_type) and '<EVENT>' in targets:
                matches = True
            elif (self.is_physical_entity_type(ann_type)
                  and '<ENTITY>' in targets):
                matches = True
            else:
                matches = (self.is_relation_type(ann_type)
                           and '<RELATION>' in targets)

            if matches:
                applicable.append(attribute.storage_form())

        return applicable
Esempio n. 2
0
    def test_05_output(self):
        """
        test output of pending messages

        Queues one message per level, checks the text written by
        Messager.output, then checks that nothing is written once the
        queue has been cleared.
        """
        Messager.warning(u'Hello warning')
        Messager.info(u'Hello info')
        Messager.debug(u'Hello debug')
        Messager.error(u'Hello error')

        tmp = NamedTemporaryFile("w", delete=False)
        path = tmp.name
        try:
            Messager.output(tmp)
            tmp.close()

            with open(path, "r") as written:
                self.assertEqual(
                    written.read(),
                    u"warning : Hello warning\n"
                    u"comment : Hello info\n"
                    u'debug : Hello debug\n'
                    u'error : Hello error\n')

            Messager.clear()

            # after clearing, a fresh dump must leave the file empty
            with open(path, "w") as sink:
                Messager.output(sink)
            with open(path, "r") as written:
                self.assertEqual(written.read(), "")
        finally:
            # delete=False above: the file has to be removed by hand
            os.unlink(path)
Esempio n. 3
0
 def wrapper(*args, **kwds):
     """
     Call func with the given arguments, first sending a deprecation
     warning naming func when DEBUG is set
     """
     if DEBUG:
         notice = ('Client sent "%s" action '
                   'which is marked as deprecated') % func.__name__
         Messager.warning(notice)
     return func(*args, **kwds)
Esempio n. 4
0
 def test_01_warning(self):
     """
     test warning level

     A non-ASCII warning must come back unchanged from output_json,
     tagged 'warning' and with the value 3 in the third position.
     """
     # The message is written with escapes (it ends in U+FF01, the
     # fullwidth exclamation mark) so that the text sent and the text
     # expected are guaranteed to be the same characters.
     Messager.warning(u'Hello \u4e16\u754c\uff01')
     json_dic = {}
     Messager.output_json(json_dic)
     self.assertEqual(
         json_dic,
         {'messages': [(u'Hello \u4e16\u754c\uff01', 'warning', 3)]})
Esempio n. 5
0
def get_labels(directory):
    """
    Map each storage form listed in the label section of the visual
    configuration to its labels.

    When a storage form is defined more than once a warning is sent and
    the last definition wins.
    """
    labels_by_form = {}
    label_entries = get_visual_configs(directory)[0][cst.LABEL_SECTION]
    for entry in label_entries:
        form = entry.storage_form()
        if form in labels_by_form:
            Messager.warning(
                "In configuration, labels for '%s' defined more "
                "than once. Only using the last set." % form, -1)
        # the first term is the storage form, the rest are the labels.
        labels_by_form[form] = entry.terms[1:]
    return labels_by_form
Esempio n. 6
0
def get_configs(directory, filename, defaultstr, minconf, sections,
                optional_sections):
    """
    Return the (configs, section_labels) pair for a configuration file.

    The result is cached in _GET_CONFIGS_CACHE per (directory, filename).
    On a cache miss the configuration text is looked up in the directory
    tree, then through __read_or_default (falling back to defaultstr).
    If parsing raises InvalidProjectConfigException, minconf is used and
    every section is labelled with its own name.
    """
    cache_key = (directory, filename)
    if cache_key in _GET_CONFIGS_CACHE:
        return _GET_CONFIGS_CACHE[cache_key]

    configstr, source = __read_first_in_directory_tree(directory, filename)

    if configstr is None:
        # nothing in the tree; try default dir and fall back to the default
        configstr = __read_or_default(filename, defaultstr)
        if configstr != defaultstr:
            source = filename
        else:
            Messager.info(
                "Project configuration: no configuration file (%s) "
                "found, using default." % filename, 5)
            source = "[default]"

    # parse what was found, falling back to the minimal configuration
    try:
        configs, section_labels = __parse_configs(configstr, source,
                                                  sections,
                                                  optional_sections)
    except InvalidProjectConfigException:
        Messager.warning(
            "Project configuration: Falling back to minimal default. "
            "Configuration is likely wrong.", 5)
        configs = minconf
        section_labels = {name: name for name in sections}

    # Very special case: a type "Equiv" in the "relations" section that
    # does not specify "<REL-TYPE>" is automatically treated as
    # "symmetric" and "transitive". This supports older configurations
    # that rely on the type name "Equiv" to mark an equivalence relation.
    # (No warning is sent for this: it was too noisy while most
    # configurations still rely on the old convention.)
    if 'relations' in configs:
        for relation in configs['relations']:
            if relation == cst.SEPARATOR_STR:
                continue
            if relation.storage_form() != "Equiv":
                continue
            if "<REL-TYPE>" not in relation.special_arguments:
                relation.special_arguments["<REL-TYPE>"] = [
                    "symmetric", "transitive"
                ]

    _GET_CONFIGS_CACHE[cache_key] = (configs, section_labels)
    return _GET_CONFIGS_CACHE[cache_key]
Esempio n. 7
0
 def get_search_config(self):
     """
     Returns a list of (storage form, URL) tuples, one per search
     configuration entry. Entries without a <URL> specification are
     reported with a warning and left out.
     """
     configured = []
     for entry in com.get_search_config_list(self.directory):
         if '<URL>' in entry.special_arguments:
             name = entry.storage_form()
             url = entry.special_arguments['<URL>'][0]
             configured.append((name, url))
             continue
         Messager.warning(
             'Project configuration: config error: missing <URL> '
             'specification for %s search.' % entry.storage_form())
     return configured
Esempio n. 8
0
 def argument_minimum_count(self, atype, arg):
     """
     Returns the minimum number of times that the given argument is
     allowed to be filled for an annotation of the given type.

     Returns 0, after sending a warning, when the type is not found in
     the configuration.
     """
     node = com.get_node_by_storage_form(self.directory, atype)
     if node is not None:
         return node.argument_minimum_count(arg)

     Messager.warning(
         "Project configuration: unknown event type %s. "
         "Configuration may be wrong." % atype)
     return 0
Esempio n. 9
0
 def multiple_allowed_arguments(self, atype):
     """
     Returns the argument types that are allowed to be filled more
     than once for an annotation of the given type.

     Returns an empty list, after sending a warning, when the type is
     not found in the configuration.
     """
     node = com.get_node_by_storage_form(self.directory, atype)
     if node is not None:
         return node.multiple_allowed_arguments()

     Messager.warning(
         "Project configuration: unknown event type %s. "
         "Configuration may be wrong." % atype)
     return []
Esempio n. 10
0
 def mandatory_arguments(self, atype):
     """
     Returns the mandatory argument types that must be present for
     an annotation of the given type.

     Returns an empty list, after sending a warning, when the type is
     not found in the configuration.
     """
     node = com.get_node_by_storage_form(self.directory, atype)
     if node is not None:
         return node.mandatory_arguments()

     Messager.warning(
         "Project configuration: unknown event type %s. "
         "Configuration may be wrong." % atype)
     return []
Esempio n. 11
0
def _store_cache_stat(docstats, cache_file_path, directory):
    """
    Pickle the statistics into the cache file.

    An IOError while writing is not raised; it is reported as a warning
    that names the directory.
    """
    try:
        with open(cache_file_path, 'wb') as sink:
            pickle_dump(docstats, sink, protocol=constants.PICKLE_PROTOCOL)
    except IOError as error:
        details = (directory, error)
        Messager.warning(
            "Could not write statistics cache file to directory %s: %s" %
            details)
Esempio n. 12
0
def __parse_kb_shortcuts(shortcutstr, default, source):
    """
    Parse keyboard shortcut definitions into a {key: type} dict.

    Each non-blank line that does not start with "#" is split on runs of
    spaces/tabs into a key and a type; a line that does not split into
    exactly two fields makes the unpacking raise ValueError. When a key
    is defined more than once a warning is sent and the first definition
    is kept. The default and source parameters are not used here.
    """
    shortcuts = {}
    for raw_line in shortcutstr.split("\n"):
        line = raw_line.strip()
        # skip blank lines and comments
        if not line or line.startswith("#"):
            continue
        key, type_ = re.split(r'[ \t]+', line)
        if key not in shortcuts:
            shortcuts[key] = type_
            continue
        Messager.warning("Project configuration: keyboard shortcut "
                         "for '%s' defined multiple times. Ignoring "
                         "all but first ('%s')" % (key, shortcuts[key]))
    return shortcuts
Esempio n. 13
0
def get_node_by_storage_form(directory, term):
    """
    Return the entity or event type node whose storage form is term, or
    None when there is none.

    The storage-form index is built once per directory and kept in
    _GET_NODE_BY_STORAGE_FORM_CACHE. When a storage form occurs more than
    once a warning is sent and the last node wins.
    """
    if directory not in _GET_NODE_BY_STORAGE_FORM_CACHE:
        nodes_by_form = {}
        candidates = (get_entity_type_list(directory)
                      + get_event_type_list(directory))
        for node in candidates:
            form = node.storage_form()
            if form in nodes_by_form:
                Messager.warning(
                    "Project configuration: term %s appears multiple times, "
                    "only using last. Configuration may be wrong." % form, 5)
            nodes_by_form[form] = node
        _GET_NODE_BY_STORAGE_FORM_CACHE[directory] = nodes_by_form

    return _GET_NODE_BY_STORAGE_FORM_CACHE[directory].get(term)
Esempio n. 14
0
    def __init__(self, terms, args=None):
        """
        Build a node from its terms and its argument specifications.

        terms is modified in place: a leading "!" is stripped from any
        term (and marks the node as unused), and the first term is
        replaced by its storage form when the two differ.

        Raises InvalidProjectConfigException when terms is empty or
        contains an empty string, or when an argument fails to parse.
        """
        if args is None:
            args = []
        self.terms = terms
        self.args = args

        if not terms or any(term == "" for term in terms):
            Messager.debug("Empty term in configuration", duration=-1)
            raise InvalidProjectConfigException

        # the node is unused if any of its terms is marked with "!"
        self.unused = False
        for index, term in enumerate(self.terms):
            if term[0] == "!":
                self.terms[index] = term[1:]
                self.unused = True
        self.children = []

        # The first of the listed terms is the primary term used for
        # storage (except for "special" config-only types). Due to
        # format restrictions, this form must not contain e.g. spaces or
        # various special characters, so it is normalized.
        head = self.terms[0]
        if head in cst.SPECIAL_RELATION_TYPES:
            self.__primary_term = head
        else:
            self.__primary_term = normalize_to_storage_form(head)
        # TODO: this might not be the ideal place to put this warning
        if self.__primary_term != head:
            Messager.warning(
                "Note: in configuration, term '%s' is not "
                "appropriate for storage (should match "
                "'^[a-zA-Z0-9_-]*$'), using '%s' instead. "
                "(Revise configuration file to get rid of "
                "this message. Terms other than the first are "
                "not subject to this restriction.)" %
                (head, self.__primary_term), -1)
            self.terms[0] = self.__primary_term

        # TODO: cleaner and more localized parsing
        self.arguments = {}
        self.special_arguments = {}
        self.arg_list = []
        self.arg_min_count = {}
        self.arg_max_count = {}
        self.keys_by_type = {}
        for argument in self.args:
            self._process_arg(argument, args)
Esempio n. 15
0
    def sentences(self):
        """
        Split self.text with the splitter named by the
        "sentence-splitter" configuration entry ('newline' or 'regex').
        Any other value sends a warning and falls back to newline
        splitting.

        :returns: list of sentence tuple offsets
        """
        ssplitter = self.configuration["sentence-splitter"]
        if ssplitter == 'regex':
            from arat.server.ssplit import (
                regex_sentence_boundary_gen as ss_offset_gen)
        else:
            if ssplitter != 'newline':
                Messager.warning('Unrecognized sentence splitting option '
                                 ', reverting to newline sentence splitting.')
            from arat.server.ssplit import (
                newline_sentence_boundary_gen as ss_offset_gen)

        return list(ss_offset_gen(self.text))
Esempio n. 16
0
def store_svg(collection, document, svg):
    """
    Save the SVG of a document and attempt to convert it to other formats.

    The SVG is saved through _save_svg. Then, for every (format, command)
    pair in config.SVG_CONVERSION_COMMANDS (none when that name cannot be
    imported), command is filled in with the SVG path and the output path
    and run through the shell.

    Returns a dict {'stored': [...]} with one {'name', 'suffix'} entry for
    the SVG itself and one per conversion that raised no exception. The
    exit status of the conversion command is not checked.
    """
    import logging
    from os import system

    stored = []

    _save_svg(collection, document, svg)
    stored.append({'name': 'svg', 'suffix': SVG_SUFFIX})

    # attempt conversions from SVG to other formats
    try:
        from config import SVG_CONVERSION_COMMANDS
    except ImportError:
        SVG_CONVERSION_COMMANDS = []

    for format_, command in SVG_CONVERSION_COMMANDS:
        try:
            # NOTE(review): _svg_path is called without arguments here;
            # presumably it resolves the file just saved - confirm.
            svgfn = _svg_path()
            # TODO: assuming format name matches suffix; generalize
            outfn = svgfn.replace('.' + SVG_SUFFIX, '.' + format_)
            cmd = command % (svgfn, outfn)

            logging.error(cmd)

            # NOTE(review): cmd is a shell string built from configuration
            # and file paths; confirm those paths cannot carry shell
            # metacharacters.
            system(cmd)

            # TODO: check the return value and react appropriately. It is
            # ignored because inkscape was seen returning odd values;
            # checking that the output file exists (after deleting any
            # older file of the same name) may be more reliable.
            # Record the format name itself (format_), not the builtin
            # format function.
            stored.append({'name': format_, 'suffix': format_})

        except Exception:  # pylint: disable=W0703
            # best effort: a failed conversion must not fail the save
            Messager.warning("Failed conversion to %s" % format_)

    return {'stored': stored}
Esempio n. 17
0
 def get_normalization_config(self):
     """
     Returns a list of (storage form, URL, URL base, DB) tuples, one per
     normalization configuration entry.

     Missing 'DB' and '<URLBASE>' values are filled in on the entry as
     [None]; entries without a <URL> specification are reported with a
     warning and left out.
     """
     norm_config = []
     for norm in com.get_normalization_config_list(self.directory):
         # optional, server looks in default location if None
         if 'DB' not in norm.arguments:
             norm.arguments['DB'] = [None]

         if '<URL>' not in norm.special_arguments:
             Messager.warning(
                 'Project configuration: config error: missing <URL> '
                 'specification for %s.' % norm.storage_form())
             continue

         # now optional, client skips link generation if None
         if '<URLBASE>' not in norm.special_arguments:
             norm.special_arguments['<URLBASE>'] = [None]

         record = (norm.storage_form(),
                   norm.special_arguments['<URL>'][0],
                   norm.special_arguments['<URLBASE>'][0],
                   norm.arguments['DB'][0])
         norm_config.append(record)
     return norm_config
Esempio n. 18
0
def __directory_relations_by_arg_num(directory,
                                     num,
                                     atype,
                                     include_special=False):
    """
    Return the relation types whose argument number num (0 or 1) accepts
    the type atype.

    A relation is appended once for every listed argument type that
    matches, so it can occur more than once in the result. Relations in
    cst.SPECIAL_RELATION_TYPES are skipped unless include_special is
    set. Relations that do not have exactly two arguments are skipped,
    with a warning unless they are marked unused.
    """
    assert 0 <= num < 2, "INTERNAL ERROR"

    entity_types = {t.storage_form() for t in get_entity_type_list(directory)}
    event_types = {t.storage_form() for t in get_event_type_list(directory)}

    matching = []
    for relation in get_relation_type_list(directory):
        # "Special" nesting relations ignored unless specifically
        # requested
        is_special = relation.storage_form() in cst.SPECIAL_RELATION_TYPES
        if is_special and not include_special:
            continue

        if len(relation.arg_list) != 2:
            # Don't complain about argument constraints for unused relations
            if not relation.unused:
                Messager.warning(
                    "Relation type %s has %d arguments in "
                    "configuration (%s; expected 2). Please fix "
                    "configuration." %
                    (relation.storage_form(), len(relation.arg_list),
                     ",".join(relation.arg_list)))
            continue

        for type_ in relation.arguments[relation.arg_list[num]]:
            # TODO: there has to be a better way
            if type_ == atype or type_ == "<ANY>" or atype == "<ANY>":
                accepted = True
            elif type_ in entity_types and atype == "<ENTITY>":
                accepted = True
            elif type_ in event_types and atype == "<EVENT>":
                accepted = True
            elif atype in entity_types and type_ == "<ENTITY>":
                accepted = True
            else:
                accepted = atype in event_types and type_ == "<EVENT>"

            if accepted:
                matching.append(relation)
                # TODO: why not break here?

    return matching
Esempio n. 19
0
def _get_option_by_storage_form(directory, term, config, cache):
    """
    Return the {key: value} options of the config node whose storage form
    is term, or None when there is none.

    The per-directory index is built on first use and stored in cache.
    Only the first value of each key is kept; a warning is sent when a
    key does not have exactly one value, and when a storage form occurs
    more than once (the last occurrence wins).
    """
    if directory not in cache:
        options_by_form = {}
        for node in config:
            form = node.storage_form()
            if form in options_by_form:
                Messager.warning(
                    "Project configuration: %s appears multiple times, "
                    "only using last. Configuration may be wrong." % form, 5)
            options = {}
            options_by_form[form] = options
            for key in node.arguments:
                values = node.arguments[key]
                if len(values) != 1:
                    Messager.warning(
                        "Project configuration: %s key %s has multiple "
                        "values, only using first. Configuration may "
                        "be wrong." % (form, key), 5)
                options[key] = values[0]

        cache[directory] = options_by_form

    return cache[directory].get(term)
Esempio n. 20
0
def get_statistics(directory, base_names, use_cache=True):
    """
    Return the statistics header and the per-document statistics.

    The pickled cache of the directory is reused unless
    _need_regeneration asks for a rebuild, the cached list does not have
    one entry per base name, the cache cannot be unpickled, the check
    fails with OSError, or use_cache is false. In those cases the
    statistics come from _generate_stats.

    Returns a (stat_types, docstats) tuple.
    """
    cache_file_path = _get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as error:
        # errno 2 (no such file): there is simply no cache yet
        if error.errno != 2:
            raise
        cache_mtime = -1

    try:
        if not _need_regeneration(directory, cache_file_path, cache_mtime):
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating' %
                        cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
        else:
            generate = True
            docstats = []
    except OSError:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True

    # an explicit request to bypass the cache always wins
    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")]
    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        stat_types, docstats = _generate_stats(directory, base_names,
                                               stat_types, cache_file_path)

    return stat_types, docstats
Esempio n. 21
0
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    """
    Store the document text and its token and sentence offsets in j_dic
    under 'text', 'token_offsets' and 'sentence_offsets'.

    The text is raw_text when given, otherwise it is read from
    txt_file_path. The tokeniser and the sentence splitter are chosen
    from the options of the directory containing txt_file_path; an
    unknown splitter sends a warning and falls back to newline splitting.

    Returns True. Raises UnableToReadTextFile when the text file cannot
    be read or decoded.
    """
    if raw_text is None:
        # nobody read the text yet; do it now
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)
    else:
        # somebody read this already
        text = raw_text

    j_dic['text'] = text

    document_dir = dirname(txt_file_path)

    # First, generate tokenisation
    tok_offset_gen = tokeniser_by_name(options_get_tokenization(document_dir))
    j_dic['token_offsets'] = list(tok_offset_gen(text))

    # Then, sentence splitting
    ssplitter = options_get_ssplitter(document_dir)
    if ssplitter == 'regex':
        from arat.server.ssplit import (
            regex_sentence_boundary_gen as ss_offset_gen)
    else:
        if ssplitter != 'newline':
            Messager.warning('Unrecognized sentence splitting option '
                             ', reverting to newline sentence splitting.')
        from arat.server.ssplit import (
            newline_sentence_boundary_gen as ss_offset_gen)
    j_dic['sentence_offsets'] = list(ss_offset_gen(text))

    return True
Esempio n. 22
0
def tokeniser_by_name(name):
    """
    Look up a tokeniser by name in REGISTERED_TOKENISER.

    Available tokenizers:

    >>> tokeniser_by_name('whitespace') == whitespace_token_boundary_gen
    True

    >>> tokeniser_by_name('ptblike') == gtb_token_boundary_gen
    True

    Any other name sends a warning and returns the default whitespace
    tokeniser

    >>> tokeniser_by_name('unknown') == whitespace_token_boundary_gen
    True
    """
    if name not in REGISTERED_TOKENISER:
        Messager.warning('Unrecognized tokenisation option '
                         ', reverting to whitespace tokenisation.')
        return whitespace_token_boundary_gen

    return REGISTERED_TOKENISER[name]
Esempio n. 23
0
    def arc_types_from_to(self, from_ann, to_ann="<ANY>", include_special=False):
        """
        Returns the possible arc types that can connect an annotation
        of type from_ann to an annotation of type to_ann.
        If to_ann has the value "<ANY>", returns all possible arc types.

        The result has no duplicates and keeps first-seen order. When
        from_ann is not a known type, a warning is sent and an empty
        list is returned. include_special is only consulted when to_ann
        is "<ANY>", where it is passed on to the relation lookup.
        """
        from_node = com.get_node_by_storage_form(self.directory, from_ann)

        if from_node is None:
            Messager.warning(
                "Project configuration: unknown textbound/event type %s. "
                "Configuration may be wrong." % from_ann)
            return []

        if to_ann == "<ANY>":
            relations_from = com.get_relations_by_arg1(
                self.directory, from_ann, include_special)
            # TODO: consider using from_node.arg_list instead of .arguments for order
            return com.unique_preserve_order(
                list(from_node.arguments)
                + [r.storage_form() for r in relations_from])

        keys_by_type = from_node.keys_by_type

        # specific hits
        # Work on a copy: the list held in keys_by_type belongs to the
        # configuration node, and extending it in place would leave the
        # generic keys and relation types below in the node's own data,
        # growing it on every call.
        types = list(keys_by_type.get(to_ann, []))

        if "<ANY>" in keys_by_type:
            types.extend(keys_by_type["<ANY>"])

        # generic arguments
        if self.is_event_type(to_ann) and '<EVENT>' in keys_by_type:
            types.extend(keys_by_type['<EVENT>'])
        if self.is_physical_entity_type(to_ann) and '<ENTITY>' in keys_by_type:
            types.extend(keys_by_type['<ENTITY>'])

        # relations
        types.extend(self.relation_types_from_to(from_ann, to_ann))

        return com.unique_preserve_order(types)
Esempio n. 24
0
    def overlap_types(self, inner, outer):
        """
        Returns the set of annotation overlap types that have been
        configured for the given pair of annotations.

        New-style configuration (cst.TEXTBOUND_OVERLAP_TYPE) takes
        precedence over old-style configuration
        (cst.ENTITY_NESTING_TYPE), which is translated to 'contain'.
        Values other than 'contain', 'equal', 'cross' and '<ANY>' are
        reported with a warning but still returned.
        """
        # TODO: this is O(NM) for relation counts N and M and goes
        # past much of the implemented caching. Might become a
        # bottleneck for annotations with large type systems.
        from_inner = com.get_relations_by_arg1(self.directory, inner, True)
        to_outer = com.get_relations_by_arg2(self.directory, outer, True)

        # special relations configured for both ends of the pair
        shared = [rel for rel in from_inner
                  if rel.storage_form() in cst.SPECIAL_RELATION_TYPES
                  and rel in to_outer]

        overlap_rels = [rel for rel in shared
                        if rel.storage_form() == cst.TEXTBOUND_OVERLAP_TYPE]
        nesting_rels = [rel for rel in shared
                        if rel.storage_form() == cst.ENTITY_NESTING_TYPE]

        ovl_types = set()
        if overlap_rels:
            if nesting_rels:
                Messager.warning('Warning: both ' + cst.TEXTBOUND_OVERLAP_TYPE +
                                 ' and ' + cst.ENTITY_NESTING_TYPE +
                                 ' defined for (' + inner + ',' + outer +
                                 ') in config. Ignoring latter.')
            for rel in overlap_rels:
                if cst.OVERLAP_TYPE_ARG in rel.special_arguments:
                    for val in rel.special_arguments[cst.OVERLAP_TYPE_ARG]:
                        ovl_types |= set(val.split('|'))
                else:
                    Messager.warning('Warning: missing ' + cst.OVERLAP_TYPE_ARG +
                                     ' for ' + cst.TEXTBOUND_OVERLAP_TYPE +
                                     ', ignoring specification.')
        elif nesting_rels:
            # translate into new-style configuration
            ovl_types = {'contain'}

        known_types = ('contain', 'equal', 'cross', '<ANY>')
        undefined_types = [t for t in ovl_types if t not in known_types]
        if undefined_types:
            Messager.warning('Undefined ' + cst.OVERLAP_TYPE_ARG + ' value(s) ' +
                             str(undefined_types) + ' for (' + inner + ',' +
                             outer + ') in config. ')
        return ovl_types
Esempio n. 25
0
def __parse_configs(configstr, source, expected_sections, optional_sections):
    """
    Parse configuration text into per-section term hierarchies.

    The top-level structure is a set of term hierarchies separated by
    "[SECTION]" lines (e.g. "entities", "relations"); lines before the
    first header belong to the section "general". Section names are
    mapped through cst.SECTION_ALIAS.

    Returns (configs, section_labels): configs maps each section name to
    its parsed hierarchy (an empty list for expected sections that are
    absent), section_labels maps each section name to the label actually
    used in the text. An exception raised while parsing a section is
    reported with a warning and re-raised.
    """
    header_pattern = re.compile(r'^\s*\[(.*)\]\s*$')

    # split the lines by section, remembering the label (default name or
    # alias) used for each section.
    current = "general"
    lines_by_section = {current: []}
    section_labels = {}
    for line in configstr.split("\n"):
        header = header_pattern.match(line)
        if header is None:
            lines_by_section[current].append(line)
            continue

        # map and store section name/alias (e.g. "spans" -> "entities")
        label = header.group(1)
        current = cst.SECTION_ALIAS.get(label, label)
        section_labels[current] = label

        if current not in expected_sections:
            Messager.warning(
                "Project configuration: unexpected section "
                "[%s] in %s. Ignoring contents." % (current, source), 5)
        if current not in lines_by_section:
            lines_by_section[current] = []

    # attempt to parse the lines of each section as a term hierarchy
    configs = {}
    for name, lines in lines_by_section.items():
        try:
            configs[name] = __read_term_hierarchy(lines, name)
        except Exception as error:
            Messager.warning(
                "Project configuration: error parsing section "
                "[%s] in %s: %s" % (name, source, str(error)), 5)
            raise

    # verify that expected sections are present; replace with empty if not.
    for name in expected_sections:
        if name in configs:
            continue
        if name not in optional_sections:
            Messager.warning(
                "Project configuration: missing section [%s] in %s. "
                "Configuration may be wrong." % (name, source), 5)
        configs[name] = []

    return (configs, section_labels)
Esempio n. 26
0
 def _get_tool_config(self, tool_list):
     """
     Returns a list of (storage form, tool, model, URL) tuples, one per
     entry of tool_list. Entries missing the <URL> specification, the
     "tool" name or the "model" name are reported with a warning and
     left out.
     """
     tool_config = []
     for entry in tool_list:
         if '<URL>' not in entry.special_arguments:
             problem = ('Project configuration: config error: missing <URL> '
                        'specification for %s.')
         elif 'tool' not in entry.arguments:
             problem = ('Project configuration: config error: missing tool '
                        'name ("tool") for %s.')
         elif 'model' not in entry.arguments:
             problem = ('Project configuration: config error: missing model '
                        'name ("model") for %s.')
         else:
             problem = None

         if problem is not None:
             Messager.warning(problem % entry.storage_form())
             continue

         tool_config.append((entry.storage_form(),
                             entry.arguments['tool'][0],
                             entry.arguments['model'][0],
                             entry.special_arguments['<URL>'][0]))
     return tool_config
Esempio n. 27
0
    def _process_arg(self, arg, args):
        """
        Parse one "KEY:TYPES" argument specification and record it.

        Keys in cst.RESERVED_CONFIG_STRING (e.g. "<REL-TYPE>" or "<URL>")
        are stored in self.special_arguments and skip all further
        processing. Any other key may end in a regex-like repetition
        modifier:

        - Arg      : mandatory argument, exactly one
        - Arg?     : optional argument, at most one
        - Arg*     : optional argument, any number
        - Arg+     : mandatory argument, one or more
        - Arg{N}   : mandatory, exactly N
        - Arg{N-M} : mandatory, between N and M

        and is recorded in self.arguments, self.arg_list,
        self.arg_min_count, self.arg_max_count and self.keys_by_type.
        TYPES is a "|"-separated list of types.

        arg is the specification to parse. args is the argument list it
        came from; it is only used in the parse-failure message.

        Raises InvalidProjectConfigException, after sending a warning,
        when the specification cannot be parsed, has an invalid
        repetition, repeats a key that was already defined, or lists an
        empty type.
        """
        arg = arg.strip()
        match_obj = re.match(r'^(\S*?):(\S*)$', arg)
        if not match_obj:
            Messager.warning(
                "Project configuration: Failed to parse argument "
                "'%s' (args: %s)" % (arg, args), 5)
            raise InvalidProjectConfigException
        key, atypes = match_obj.groups()

        # special case (sorry): if the key is a reserved config
        # string (e.g. "<REL-TYPE>" or "<URL>"), parse differently
        # and store separately
        if key in cst.RESERVED_CONFIG_STRING:
            # duplicate detection needs a membership test on the dict
            if key in self.special_arguments:
                Messager.warning(
                    "Project configuration: error parsing: special "
                    "argument '%s' appears multiple times." % key, 5)
                raise InvalidProjectConfigException
            # special case in special case: relation type specifications
            # are split by hyphens, nothing else is.
            # (really sorry about this.)
            if key == "<REL-TYPE>":
                self.special_arguments[key] = atypes.split("-")
            else:
                self.special_arguments[key] = [atypes]
            # NOTE: skip the rest of processing -- don't add in normal args
            return

        # split the key from its optional repetition modifier
        match_obj = re.match(r'^(\S+?)(\{\S+\}|\?|\*|\+|)$', key)
        if not match_obj:
            Messager.warning(
                "Project configuration: error parsing "
                "argument '%s'." % key, 5)
            raise InvalidProjectConfigException
        key, rep = match_obj.groups()

        # (minimum, maximum) for the modifiers that need no parsing
        simple_counts = {
            '': (1, 1),             # exactly one
            '?': (0, 1),            # zero or one
            '*': (0, sys.maxsize),  # any number
            '+': (1, sys.maxsize),  # one or more
        }
        if rep in simple_counts:
            minimum_count, maximum_count = simple_counts[rep]
        else:
            # exact number or range constraint
            assert '{' in rep and '}' in rep, "INTERNAL ERROR"
            range_match = re.match(r'\{(\d+)(?:-(\d+))?\}$', rep)
            if not range_match:
                Messager.warning(
                    "Project configuration: error parsing range '%s' in "
                    "argument '%s' (syntax is "
                    "'{MIN-MAX}')." % (rep, key + rep), 5)
                raise InvalidProjectConfigException
            low, high = range_match.groups()
            low = int(low)
            if high is None:
                # exact number
                if low == 0:
                    Messager.warning(
                        "Project configuration: cannot have exactly "
                        "0 repetitions of argument '%s'." % (key + rep), 5)
                    raise InvalidProjectConfigException
                minimum_count = low
                maximum_count = low
            else:
                # range
                high = int(high)
                if low > high:
                    Messager.warning(
                        "Project configuration: invalid range %d-%d "
                        "for argument '%s'." % (low, high, key + rep), 5)
                    raise InvalidProjectConfigException
                minimum_count = low
                maximum_count = high

        # format / config sanity: an argument whose label ends
        # with a digit label cannot be repeated, as this would
        # introduce ambiguity into parsing. (For example, the
        # second "Theme" is "Theme2", and the second "Arg1" would
        # be "Arg12".)
        if maximum_count > 1 and key[-1].isdigit():
            Messager.warning(
                "Project configuration: error parsing: arguments ending "
                "with a digit cannot be repeated: '%s'" % (key + rep), 5)
            raise InvalidProjectConfigException

        if key in self.arguments:
            Messager.warning(
                "Project configuration: error parsing: argument '%s' "
                "appears multiple times." % key, 5)
            raise InvalidProjectConfigException

        assert (key not in self.arg_min_count
                and key not in self.arg_max_count), "INTERNAL ERROR"
        self.arg_min_count[key] = minimum_count
        self.arg_max_count[key] = maximum_count

        self.arg_list.append(key)

        for atype in atypes.split("|"):
            if atype.strip() == "":
                Messager.warning(
                    "Project configuration: error parsing: empty type for "
                    "argument '%s'." % arg, 5)
                raise InvalidProjectConfigException

            # No check that atype is a valid storage form: arbitrary
            # UTF values must be supported for visual.conf.
            # TODO: add such a check for the other configs.

            self.arguments.setdefault(key, []).append(atype)
            self.keys_by_type.setdefault(atype, []).append(key)
Esempio n. 28
0
def get_drawing_config_by_storage_form(directory, term):
    """
    Return the drawing (visual) configuration for a term.

    The configuration for a whole directory is built once from
    get_drawing_config(directory) and then kept in the ``__cache``
    attribute of this function (expected to be set on the function
    object outside this block), keyed by directory.

    Building the per-directory table does the following, in order:

    * collects the arguments of every node under its storage form;
      a storage form seen more than once keeps only the last node
      (with a warning);
    * stores single-valued arguments as plain strings and multi-valued
      arguments listed in cst.ATTR_DRAWING_ATTRIBUTES as lists; other
      multi-valued arguments are dropped with a warning;
    * replaces every "-" in the values by "," (workaround for commas
      not being expressible in the configuration);
    * fills in keys missing from a term with the values of the span,
      arc and attribute default entries, in that order;
    * removes keys whose value is '<NONE>' and turns '<EMPTY>' values
      into the empty string.

    Returns the dictionary of drawing settings for ``term``, or None if
    the term has no entry. The returned dictionary is the cached object
    itself, not a copy.
    """
    cache = get_drawing_config_by_storage_form.__cache
    if directory not in cache:
        d = {}
        for n in get_drawing_config(directory):
            t = n.storage_form()
            if t in d:
                Messager.warning(
                    "Project configuration: term %s appears multiple times, "
                    "only using last. Configuration may be wrong." % t, 5)
            d[t] = {}
            for a in n.arguments:
                values = n.arguments[a]
                # attribute drawing can be specified with multiple
                # values (multi-valued attributes), other parts of
                # drawing config should have single values only.
                if len(values) == 1:
                    d[t][a] = values[0]
                elif a in cst.ATTR_DRAWING_ATTRIBUTES:
                    # Store a copy: keeping the node's own list here
                    # would let the comma rewrite below leak back into
                    # the configuration node.
                    d[t][a] = list(values)
                else:
                    # warn and pass
                    Messager.warning("Project configuration: expected "
                                     "single value for %s argument %s, "
                                     "got '%s'. Configuration may be "
                                     "wrong." %
                                     (t, a, "|".join(values)))

        # TODO: hack to get around inability to have commas in values;
        # fix original issue instead
        for conf in d.values():
            for k, v in conf.items():
                # only existing keys are rebound, so iterating is safe
                if isinstance(v, list):
                    conf[k] = [item.replace("-", ",") for item in v]
                else:
                    conf[k] = v.replace("-", ",")

        # Propagate defaults: a key is only filled in where the term
        # does not define it already, so earlier defaults win over
        # later ones.
        default_keys = [
            cst.VISUAL_SPAN_DEFAULT, cst.VISUAL_ARC_DEFAULT,
            cst.VISUAL_ATTR_DEFAULT
        ]
        for dk in default_keys:
            default_dict = d.get(dk, {})
            for k in default_dict:
                for conf in d.values():
                    conf.setdefault(k, default_dict[k])

        # Kind of a special case: recognize <NONE> as "deleting" an
        # attribute (prevents default propagation) and <EMPTY> as
        # specifying that a value should be the empty string
        # (can't be written as such directly).
        for conf in d.values():
            for k in [k for k, v in conf.items() if v == '<NONE>']:
                del conf[k]

            for k, v in conf.items():
                if v == '<EMPTY>':
                    conf[k] = ''

        cache[directory] = d

    return cache[directory].get(term)
Esempio n. 29
0
def _fill_attribute_configuration(nodes, project_conf):
    """
    Build the list of attribute descriptions for the given nodes.

    Every node other than SEPARATOR_STR yields one dictionary with the
    keys 'name', 'type', 'unused', 'labels' and 'values', plus
    'default' when the node has a non-empty '<DEFAULT>' special
    argument. 'values' is a list of dictionaries, one per possible
    value of the attribute: a node without a "Value" argument is
    treated as binary and gets a single entry named after the type,
    otherwise there is one entry per listed value.

    Drawing settings are looked up through
    project_conf.get_drawing_config_by_type, first for the attribute
    type, then for VISUAL_ATTR_DEFAULT; only the keys listed in
    ATTR_DRAWING_ATTRIBUTES are copied into the value entries.

    Returns the list of dictionaries in node order.
    """
    filled = []
    for node in nodes:
        # separators carry no attribute information
        if node == SEPARATOR_STR:
            continue

        attr_type = node.storage_form()
        entry = {
            'name': project_conf.preferred_display_form(attr_type),
            'type': attr_type,
            'unused': node.unused,
            'labels': project_conf.get_labels_by_type(attr_type),
        }

        # drawing config: own type first, then the attribute default,
        # then nothing at all
        drawing = project_conf.get_drawing_config_by_type(attr_type)
        if drawing is None:
            drawing = project_conf.get_drawing_config_by_type(
                VISUAL_ATTR_DEFAULT)
        if drawing is None:
            drawing = {}

        # Possible values of the attribute; none listed means binary.
        # TODO: avoid magic strings
        if "Value" in node.arguments:
            value_names = node.arguments["Value"]
        else:
            value_names = []

        # optional default value of the attribute
        if '<DEFAULT>' in node.special_arguments:
            try:
                entry['default'] = node.special_arguments['<DEFAULT>'][0]
            except IndexError:
                Messager.warning("Config error: empty <DEFAULT> for %s" %
                                 entry['name'])

        if not value_names:
            # binary attribute: a single value entry that takes the
            # drawing config as it is
            only_value = {'name': attr_type}
            for key in ATTR_DRAWING_ATTRIBUTES:
                if key not in drawing:
                    continue
                setting = drawing[key]
                if isinstance(setting, list):
                    # a binary attribute cannot use a multi-valued
                    # visual config (#698); warn and use the first
                    # value just to have something
                    Messager.warning(
                        "Visual config error: expected single value "
                        "for %s binary attribute '%s' config, found %d. "
                        "Visuals may be wrong."
                        % (attr_type, key, len(setting)))
                    setting = setting[0]
                only_value[key] = setting
            entry['values'] = [only_value]
        else:
            # one value entry per listed value, filled in one by one
            entry['values'] = []
            for position, value_name in enumerate(value_names):
                option = {'name': value_name}

                # Drawing config is matched to values by position, so
                # that e.g. "Values:L1|L2|L3" can have the visual
                # config "glyph:[1]|[2]|[3]". A setting that is not a
                # list applies to every value.
                for key in ATTR_DRAWING_ATTRIBUTES:
                    if key not in drawing:
                        continue
                    setting = drawing[key]
                    if not isinstance(setting, list):
                        option[key] = setting
                    elif position < len(setting):
                        option[key] = setting[position]
                    else:
                        # fewer settings than values
                        Messager.warning(
                            "Visual config error: expected %d values for "
                            "%s attribute '%s' config, found only %d. "
                            "Visuals may be wrong."
                            % (len(value_names), value_name, key,
                               len(setting)))

                # without any drawing attribute, show the value itself
                # in brackets as the glyph
                if not any(key in option for key in ATTR_DRAWING_ATTRIBUTES):
                    option['glyph'] = '[' + value_name + ']'

                entry['values'].append(option)

        filled.append(entry)
    return filled
Esempio n. 30
0
def __read_term_hierarchy(input_, section=None):
    """
    Parse configuration lines into a forest of TypeHierarchyNode.

    input_ is iterated item by item and every item is handled as one
    line:

    * blank lines and lines whose first non-space character is '#'
      are skipped;
    * a line made only of hyphens adds cst.SEPARATOR_STR to the result;
    * a line of the form <NAME>=VALUE defines a macro that is
      substituted into all following lines;
    * any other line is split into an indent, '|'-separated terms and
      ','-separated arguments. The length of the indent is the depth
      of the node, and an '<INHERIT>' argument is replaced by the
      arguments of the latest line one level up.

    section is only handed to __require_tab_separator, which decides
    whether terms and arguments must be separated by a tab or may be
    separated by any whitespace.

    Returns the list of root-level nodes and separators in input order.

    Raises InvalidProjectConfigException when a macro definition uses a
    name from cst.RESERVED_CONFIG_NAME, and AssertionError for an
    undefined macro, a line that cannot be parsed or a node without a
    parent.

    Note that the example below passes one joined string, which is
    therefore iterated character by character.

    >>> _input = ["# This a comment to be ignored"]
    >>> _input.append("[spans]")
    >>> _input.append("# POS tags")
    >>> _input.append("adj")
    >>> _input.append("adv")
    >>> _input.append("art")

    >>> isinstance((__read_term_hierarchy("\\n".join(_input))[0]), TypeHierarchyNode)
    True

    """

    # patterns cutting a line into (indent, terms, arguments)
    tab_split = r'^(\s*)([^\t]+)(?:\t(.*))?$'
    space_split = r'^(\s*)(\S+)(?:\s+(.*))?$'

    def split_line(pattern, text):
        # Cut text with the pattern and return its indent, the list of
        # non-empty terms and the list of non-empty arguments.
        parsed = re.match(pattern, text)
        assert parsed, "Error parsing line: '%s'" % text
        lead, term_field, arg_field = parsed.groups()
        names = [p.strip() for p in term_field.split("|") if p.strip() != ""]
        if arg_field is None or arg_field.strip() == "":
            values = []
        else:
            values = [
                p.strip() for p in arg_field.split(",") if p.strip() != ""
            ]
        return lead, names, values

    roots = []
    node_at_depth = {}
    args_at_depth = {}
    macros = {}

    for line in input_:
        # nothing to do for empty lines and comments
        if line.strip() == '' or re.match(r'^\s*#', line):
            continue

        # a run of hyphens is a separator for display
        if re.match(r'^\s*-+\s*$', line):
            # TODO: proper placeholder and placing
            roots.append(cst.SEPARATOR_STR)
            continue

        # <STR1>=STR2 defines the "macro" <STR1>, a placeholder that
        # is replaced by STR2 wherever it occurs later on
        definition = re.match(r'^<([a-zA-Z_-]+)>=\s*(.*?)\s*$', line)
        if definition:
            name, value = definition.groups()
            if name in cst.RESERVED_CONFIG_NAME:
                Messager.error("Cannot redefine <%s> in configuration, "
                               "it is a reserved name." % name)
                # TODO: proper exception
                raise InvalidProjectConfigException("Reserved name: " + name)
            macros["<%s>" % name] = value
            continue

        # substitute the macros defined so far
        for placeholder, replacement in macros.items():
            line = line.replace(placeholder, replacement)

        # whatever still looks like <...> must be a reserved string
        for found in re.finditer(r'(<.*?>)', line):
            token = found.group(1)
            assert token in cst.RESERVED_CONFIG_STRING, (
                "Error: undefined macro %s "
                "in configuration. (Note that macros are section-specific.)"
            ) % token

        # strict tab-only or looser any-space separation, depending
        # on the section
        if __require_tab_separator(section):
            pattern = tab_split
        else:
            pattern = space_split
        indent, terms, args = split_line(pattern, line)

        # Older configs allowed space in term strings and separated
        # them from the arguments by a tab. Parsed the new way, such a
        # line ends up with space inside its arguments; in that case
        # parse it again the old way and warn about it.
        if any(re.search(r'\s', a) for a in args) and '\t' in line:
            indent, terms, args = split_line(tab_split, line)
            Messager.warning(
                "Space in term name(s) (%s) on line \"%s\" "
                "in config. This feature is deprecated and "
                "support will be removed in future versions. "
                "Please revise your configuration." %
                (",".join(['"%s"' % i for i in terms if " " in i]), line), 20)

        # the depth in the ontology is the length of the indent
        depth = len(indent)
        parent_depth = depth - 1

        # replace <INHERIT> by the arguments of the parent
        resolved = []
        for arg in args:
            if arg == '<INHERIT>':
                assert parent_depth in args_at_depth, \
                    "Error no parent for '%s'" % line
                resolved.extend(args_at_depth[parent_depth])
            else:
                resolved.append(arg)

        node = TypeHierarchyNode(terms, resolved)
        if depth == 0:
            # root level, nothing to attach to
            roots.append(node)
        else:
            # child of the latest node seen one level up
            assert parent_depth in node_at_depth, \
                "Error: no parent for '%s'" % line
            node_at_depth[parent_depth].children.append(node)
        node_at_depth[depth] = node
        args_at_depth[depth] = resolved

    return roots