Beispiel #1
0
    def _init_grammar(self):
        """
        Extract all grammar symbols (non-terminals and terminals) from the
        grammar, resolve and check references in productions, unify all
        grammar symbol objects and enumerate productions.

        Unless `self._no_check_recognizers` is set, every terminal gets its
        recognizer from `self.recognizers` (keyed by terminal name) if an
        entry exists there; this overrides a recognizer given in the grammar.

        Raises:
            GrammarError: If recognizer checking is enabled and a terminal has
                neither its own recognizer nor an entry in
                `self.recognizers`.
        """
        self.nonterminals = set()
        self.terminals = set()

        # Reserve 0 production. It is used for augmented prod. in LR
        # automata calculation.
        self.productions.insert(
            0, Production(AUGSYMBOL, ProductionRHS([self.root_symbol, STOP])))

        self._collect_grammar_symbols()

        # Add special terminals
        self._by_name['EMPTY'] = EMPTY
        self._by_name['EOF'] = EOF
        self._by_name['STOP'] = STOP
        self.terminals.update([EMPTY, EOF, STOP])

        # Connect recognizers, override grammar provided
        if not self._no_check_recognizers:
            recognizers = self.recognizers
            for term in self.terminals:
                # The truthiness test comes first so that a missing (None) or
                # empty recognizers collection never reaches the membership
                # test, which would fail with TypeError for None.
                if recognizers and term.name in recognizers:
                    term.recognizer = recognizers[term.name]
                elif term.recognizer is None:
                    if not recognizers:
                        raise GrammarError(
                            'Terminal "{}" has no recognizer defined '
                            'and no recognizers are given during grammar '
                            'construction.'.format(term.name))
                    raise GrammarError(
                        'Terminal "{}" has no recognizer defined.'.format(
                            term.name))

        self._resolve_references()

        # At the end remove terminal productions as those are not the real
        # productions, but just a symbolic names for terminals.
        non_term_productions = [
            p for p in self.productions
            if isinstance(p.symbol, NonTerminal) or p.symbol.name == 'LAYOUT'
        ]
        if len(non_term_productions) > 1:
            # We have non-terminals
            self.productions[:] = non_term_productions

        self._enumerate_productions()
        self._fix_keyword_terminals()
Beispiel #2
0
    def _resolve_references(self):
        """
        Replace every symbol reference in the productions by the single,
        registered instance of that terminal or non-terminal.

        Terminals that appear in a RHS without being registered by name are
        unified among themselves (first occurrence wins) and added to
        `self.terminals`.

        Raises:
            GrammarError: If a RHS symbol is unknown, or if a non-terminal
                production references a terminal that is already registered
                under another name.
        """

        by_name = self._by_name
        # First seen instance of each terminal that is not registered by name.
        inline_terms = {}

        for production in self.productions:

            lhs_name = production.symbol.name
            if lhs_name in by_name:
                production.symbol = by_name[lhs_name]

            # Exact type check: only plain NonTerminal instances collect
            # their productions.
            if type(production.symbol) is NonTerminal:
                production.symbol.productions.append(production)

            for position, ref in enumerate(production.rhs):
                resolved = None
                if ref.name in by_name:
                    resolved = by_name[ref.name]
                elif isinstance(production.symbol, NonTerminal) \
                        and ref.name in self._rec_to_named_term:
                    # The terminal is registered by its str recognizer and
                    # may not be referenced from a RHS of another production.
                    named_term = self._rec_to_named_term[ref.name]
                    raise GrammarError(
                        "Terminal '{}' used in production '{}' "
                        "already exists by the name '{}'.".format(
                            text(ref.name), text(production.symbol),
                            text(named_term)))
                elif not isinstance(ref, Terminal):
                    raise GrammarError(
                        "Unknown symbol '{}' used in production '{}'.".
                        format(text(ref.name), text(production.symbol)))
                elif ref.name in inline_terms:
                    resolved = inline_terms[ref.name]
                else:
                    resolved = ref
                    inline_terms[ref.name] = ref
                    self.terminals.add(resolved)

                if not resolved:
                    raise GrammarError(
                        "Unknown symbol '{}' referenced from production '{}'.".
                        format(ref.name, text(production)))

                production.rhs[position] = resolved
Beispiel #3
0
    def _create_productions(productions, start_symbol=None):
        """Build Production instances from production specifications.

        Each specification is a sequence of the form:
        [LHS, RHS, optional ASSOC, optional PRIOR]
        where LHS is a NonTerminal and RHS is a list or tuple of grammar
        symbols from the right-hand side of the production. Strings found
        in RHS are replaced by Terminal instances.

        `start_symbol` is accepted but not used here.

        Raises:
            GrammarError: If LHS of a specification is not a NonTerminal.
        """
        created = []
        for spec in productions:
            lhs = spec[0]
            if not isinstance(lhs, NonTerminal):
                raise GrammarError("Invalid production symbol '{}' "
                                   "for production '{}'".format(
                                       lhs, text(spec)))
            rhs = ProductionRHS(spec[1])
            assoc = spec[2] if len(spec) > 2 else ASSOC_NONE
            prior = spec[3] if len(spec) > 3 else DEFAULT_PRIORITY

            # Convert strings to Terminals with string recognizers
            for position, item in enumerate(rhs):
                if isinstance(item, text):
                    rhs[position] = Terminal(item)

            created.append(Production(lhs, rhs, assoc=assoc, prior=prior))

        return created
Beispiel #4
0
    def _fix_keyword_terminals(self):
        """
        If a KEYWORD terminal is defined, make every string recognizer whose
        whole value is matched by the KEYWORD regex match on word boundaries
        and mark its terminal as a keyword.

        Raises:
            GrammarError: If the KEYWORD terminal has no regex recognizer.
        """
        keyword_term = self.get_terminal('KEYWORD')
        if keyword_term is None:
            return

        # KEYWORD rule must have a regex recognizer
        keyword_rec = keyword_term.recognizer
        if not isinstance(keyword_rec, RegExRecognizer):
            raise GrammarError(
                'KEYWORD rule must have a regex recognizer defined.')

        # Swap each string recognizer fully matched by the KEYWORD regex
        # for a regex recognizer anchored on word boundaries.
        for symbol in self:
            if not isinstance(symbol, Terminal):
                continue
            str_rec = symbol.recognizer
            if not isinstance(str_rec, StringRecognizer):
                continue
            matched = keyword_rec(str_rec.value, 0)
            if matched == str_rec.value:
                symbol.recognizer = RegExRecognizer(
                    r'\b{}\b'.format(matched),
                    ignore_case=str_rec.ignore_case)
                symbol.keyword = True
Beispiel #5
0
    def _resolve_references(self):
        """
        Replace every symbol reference in the productions by the single,
        registered instance of that terminal or non-terminal.

        A Terminal found in a RHS that is not yet registered gets registered
        by its name and added to `self.terminals`.

        Raises:
            GrammarError: If a RHS symbol can't be resolved, or if a
                production whose LHS is not a terminal references a terminal
                that already exists under another name.
        """

        for production in self.productions:
            lhs_name = production.symbol.name
            if lhs_name in self._by_name:
                production.symbol = self._by_name[lhs_name]

            for position, ref in enumerate(production.rhs):
                resolved = None
                if ref.name in self._by_name:
                    resolved = self._by_name[ref.name]
                elif isinstance(ref, Terminal):
                    # Register terminal by name
                    resolved = ref
                    self._by_name[ref.name] = resolved

                    # The terminal is registered by its str recognizer and
                    # may not be referenced from a RHS of another production.
                    if not isinstance(production.symbol, Terminal):
                        named_term = self._term_to_lhs.get(ref.name)
                        if named_term:
                            raise GrammarError(
                                "Terminal '{}' used in production '{}' "
                                "already exists by the name '{}'.".format(
                                    text(ref.name), text(production.symbol),
                                    text(named_term)))
                    self.terminals.add(resolved)
                else:
                    # Element of RHS must be either a Terminal, a
                    # NonTerminal or a Reference.
                    assert isinstance(ref, (NonTerminal, Reference))

                if not resolved:
                    raise GrammarError(
                        "Unknown symbol '{}' referenced from production '{}'.".
                        format(ref.name, text(production)))

                production.rhs[position] = resolved
Beispiel #6
0
def check_name(context, name):
    """
    Used in actions to reject rule names that are reserved.

    Raises:
        GrammarError: If `name` is in RESERVED_SYMBOL_NAMES. The message
            carries the line/column calculated from the context's input
            string and start position.
    """

    if name not in RESERVED_SYMBOL_NAMES:
        return

    from parglare.parser import pos_to_line_col
    location = pos_to_line_col(context.input_str, context.start_position)
    raise GrammarError(
        'Rule name "{}" is reserved at {}.'.format(name, location))
Beispiel #7
0
 def __init__(self, regex, re_flags=re.MULTILINE, ignore_case=False):
     """
     Compile `regex` with the given flags and keep both the source and the
     compiled pattern.

     Arguments:
     regex - regular expression source; also passed to the base initializer.
     re_flags - flags for `re.compile`. By default `re.MULTILINE`.
     ignore_case(bool) - if true `re.IGNORECASE` is added to the flags.

     Raises:
         GrammarError: If the regular expression can't be compiled.
     """
     super(RegExRecognizer, self).__init__(regex)
     self._regex = regex
     self.ignore_case = ignore_case
     self.re_flags = (re_flags | re.IGNORECASE) if ignore_case else re_flags
     try:
         self.regex = re.compile(self._regex, self.re_flags)
     except re.error as ex:
         # Control characters are escaped to keep the message readable.
         escaped = esc_control_characters(self._regex)
         raise GrammarError(
             'Regex compile error in /{}/ (report: "{}")'.format(
                 escaped, str(ex)))
Beispiel #8
0
    def prod_callable(new_nt):
        """
        Create productions for the optional (?) operator: `new_nt` derives
        either the referenced grammar symbol or EMPTY.

        Raises:
            GrammarError: If a repetition modifier (`sep_ref` of the
                enclosing scope) is given, as it is not allowed for optional.
        """
        if sep_ref:
            from parglare import pos_to_line_col
            location = pos_to_line_col(context.input_str,
                                       context.start_position)
            raise GrammarError('Repetition modifier not allowed for '
                               'optional (?) for symbol "{}" at {}.'.format(
                                   gsymbol.name, location))

        # Optional: one production for the symbol and one for EMPTY
        return [Production(new_nt, ProductionRHS([alternative]))
                for alternative in (gsymbol, EMPTY)]
Beispiel #9
0
    def _resolve_action(self, old_symbol, new_symbol):
        """
        Check and resolve the common semantic action given in the grammar
        for a rule/symbol.

        Nothing is done if `new_symbol` has no action name. Otherwise the
        name must be the same as the one on `old_symbol`, and if the
        built-in `parglare.actions` module has an attribute of that name it
        is set as both `action` and `grammar_action` of `new_symbol`.

        Raises:
            GrammarError: If the two symbols have different action names.
        """
        action_name = new_symbol.action_name
        if not action_name:
            return

        if action_name != old_symbol.action_name:
            raise GrammarError(
                'Multiple different grammar actions for rule "{}".'.format(
                    new_symbol.name))

        # Look the action up in the built-in actions module. If it is not
        # found there we suppose that it is a user defined action that will
        # be provided during parser instantiation using `actions` param.
        import parglare.actions as actmodule
        if not hasattr(actmodule, action_name):
            return

        builtin_action = getattr(actmodule, action_name)
        new_symbol.action = builtin_action
        new_symbol.grammar_action = builtin_action
Beispiel #10
0
def create_table(grammar,
                 itemset_type=LR_1,
                 start_production=1,
                 prefer_shifts=False,
                 prefer_shifts_over_empty=True):
    """
    Calculate LR states with their ACTION and GOTO entries for the given
    grammar and return them wrapped in an LRTable.

    Arguments:
    grammar (Grammar): The grammar to build the table for. The RHS of its
        production 0 (augmented production) is replaced by the symbol of
        the start production followed by STOP.
    itemset_type(int) - SLR=0 LR_1=1. By default LR_1.
    start_production(int) - The production which defines start state.
        By default 1 - first production from the grammar.
    prefer_shifts(bool) - Conflict resolution strategy which favours SHIFT over
        REDUCE (greedy). Used only for reductions of non-empty productions
        not marked with `nops`. By default False.
    prefer_shifts_over_empty(bool) - Conflict resolution strategy which favours
        SHIFT over REDUCE of EMPTY. Used only for reductions of empty
        productions not marked with `nopse`. By default True.

    Both strategies are consulted only for SHIFT/REDUCE conflicts where the
    priorities are the same and the production is neither left nor right
    associative.

    Returns:
        LRTable: The table constructed from the list of calculated states.

    Raises:
        GrammarError: If the FIRST set of a grammar symbol other than `S'`
            is empty.
    """

    first_sets = first(grammar)

    # Every grammar symbol except the one named `S'` must have a non-empty
    # FIRST set. An empty set is reported as an infinite recursion on the
    # grammar symbol.
    for nt, firsts in first_sets.items():
        if nt.name != 'S\'' and not firsts:
            raise GrammarError(
                location=nt.location,
                message='First set empty for grammar symbol "{}". '
                'An infinite recursion on the '
                'grammar symbol.'.format(nt))

    follow_sets = follow(grammar, first_sets)

    # Point the augmented production (index 0) to the requested start
    # production's symbol.
    start_prod_symbol = grammar.productions[start_production].symbol
    grammar.productions[0].rhs = ProductionRHS([start_prod_symbol, STOP])

    # Create a state for the first production (augmented)
    s = LRState(grammar, 0, AUGSYMBOL,
                [LRItem(grammar.productions[0], 0, set())])

    # States waiting to be processed (FIFO) and the id to give to the next
    # newly created state.
    state_queue = [s]
    state_id = 1

    # Processed states in the order of processing.
    states = []

    while state_queue:
        # For each state calculate its closure first, i.e. starting from a
        # so called "kernel items" expand collection with non-kernel items.
        # We will also calculate GOTO and ACTIONS dicts for each state. These
        # dicts will be keyed by a grammar symbol.
        state = state_queue.pop(0)
        closure(state, itemset_type, first_sets)
        states.append(state)

        # To find out other states we examine following grammar symbols
        # in the current state (symbols following current position/"dot")
        # and group all items by a grammar symbol.
        state._per_next_symbol = OrderedDict()

        # Each production has a priority. But since productions are grouped
        # by grammar symbol that is ahead we take the maximal
        # priority given for all productions for the given grammar symbol.
        state._max_prior_per_symbol = {}

        for item in state.items:
            symbol = item.symbol_at_position
            if symbol:
                state._per_next_symbol.setdefault(symbol, []).append(item)

                # Here we calculate max priorities for each grammar symbol to
                # use it for SHIFT/REDUCE conflict resolution
                prod_prior = item.production.prior
                old_prior = state._max_prior_per_symbol.setdefault(
                    symbol, prod_prior)
                state._max_prior_per_symbol[symbol] = max(
                    prod_prior, old_prior)

        # For each group symbol we create new state and form its kernel
        # items from the group items with positions moved one step ahead.
        for symbol, items in state._per_next_symbol.items():
            inc_items = [item.get_pos_inc() for item in items]
            maybe_new_state = LRState(grammar, state_id, symbol, inc_items)
            target_state = maybe_new_state
            # Look for an equal state, first among the processed states and
            # then among the states still waiting in the queue.
            try:
                idx = states.index(maybe_new_state)
                target_state = states[idx]
            except ValueError:
                try:
                    idx = state_queue.index(maybe_new_state)
                    target_state = state_queue[idx]
                except ValueError:
                    pass

            # We've found a new state. Register it for later processing.
            if target_state is maybe_new_state:
                state_queue.append(target_state)
                state_id += 1
            else:
                # State with this kernel items already exists.
                if itemset_type is LR_1:
                    # LALR: Try to merge states, i.e. update items follow sets.
                    # If merging is refused the new state is kept as a
                    # separate state and queued for processing.
                    if not merge_states(target_state, maybe_new_state):
                        target_state = maybe_new_state
                        state_queue.append(target_state)
                        state_id += 1

            # Create entries in GOTO and ACTION tables
            if isinstance(symbol, NonTerminal):
                # For each non-terminal symbol we create an entry in GOTO
                # table.
                state.gotos[symbol] = target_state

            else:
                if symbol is STOP:
                    # STOP ahead means the input is accepted.
                    state.actions[symbol] = [
                        Action(ACCEPT, state=target_state)
                    ]
                else:
                    # For each terminal symbol we create SHIFT action in the
                    # ACTION table.
                    state.actions[symbol] = [Action(SHIFT, state=target_state)]

    # For LR(1) itemsets refresh/propagate item's follows as the LALR
    # merging might change item's follow in previous states
    if itemset_type is LR_1:

        # Propagate updates as long as there were items propagated in the last
        # loop run.
        update = True
        while update:
            update = False

            for state in states:

                # First refresh current state's follows
                closure(state, LR_1, first_sets)

                # Propagate follows to next states. GOTOs/ACTIONs keep
                # information about states created from this state
                inc_items = [i.get_pos_inc() for i in state.items]
                for target_state in chain(state.gotos.values(), [
                        a.state for i in state.actions.values()
                        for a in i if a.action is SHIFT
                ]):
                    for next_item in target_state.kernel_items:
                        # NOTE(review): index() raises ValueError if none of
                        # this state's advanced items equals the kernel item;
                        # presumably that can't happen - confirm.
                        this_item = inc_items[inc_items.index(next_item)]
                        if this_item.follow.difference(next_item.follow):
                            update = True
                            next_item.follow.update(this_item.follow)

    # Calculate REDUCTION entries in ACTION tables and resolve possible
    # conflicts.
    for state in states:
        actions = state.actions

        for item in state.items:
            if item.is_at_end:
                # If the position is at the end then this item
                # would call for reduction but only for terminals
                # from the FOLLOW set of item (LR(1)) or the production LHS
                # non-terminal (LR(0)).
                if itemset_type is LR_1:
                    follow_set = item.follow
                else:
                    follow_set = follow_sets[item.production.symbol]

                prod = item.production
                new_reduce = Action(REDUCE, prod=prod)

                for terminal in follow_set:
                    if terminal not in actions:
                        actions[terminal] = [new_reduce]
                    else:
                        # Conflict! Try to resolve
                        t_acts = actions[terminal]
                        should_reduce = True

                        # Only one SHIFT or ACCEPT might exists for a single
                        # terminal.
                        shifts = [
                            x for x in t_acts if x.action in (SHIFT, ACCEPT)
                        ]
                        assert len(shifts) <= 1
                        t_shift = shifts[0] if shifts else None

                        # But many REDUCEs might exist
                        t_reduces = [x for x in t_acts if x.action is REDUCE]

                        # We should try to resolve using standard
                        # disambiguation rules between current reduction and
                        # all previous actions.

                        if t_shift:
                            # SHIFT/REDUCE conflict. Use assoc and priority to
                            # resolve
                            sh_prior = state._max_prior_per_symbol[
                                t_shift.state.symbol]
                            if prod.prior == sh_prior:
                                if prod.assoc == ASSOC_LEFT:
                                    # Override SHIFT with this REDUCE
                                    actions[terminal].remove(t_shift)
                                elif prod.assoc == ASSOC_RIGHT:
                                    # If associativity is right leave SHIFT
                                    # action as "stronger" and don't consider
                                    # this reduction any more. Right
                                    # associative reductions can't be in the
                                    # same set of actions together with SHIFTs.
                                    should_reduce = False
                                else:
                                    # If priorities are the same and no
                                    # associativity defined use preferred
                                    # strategy. Empty productions are
                                    # governed by prefer_shifts_over_empty,
                                    # non-empty ones by prefer_shifts.
                                    is_empty = len(prod.rhs) == 0
                                    prod_pse = is_empty \
                                        and prefer_shifts_over_empty \
                                        and not prod.nopse
                                    prod_ps = not is_empty \
                                        and prefer_shifts and not prod.nops
                                    should_reduce = not (prod_pse or prod_ps)
                            elif prod.prior > sh_prior:
                                # This item operation priority is higher =>
                                # override with reduce
                                actions[terminal].remove(t_shift)
                            else:
                                # If priority of existing SHIFT action is
                                # higher then leave it instead
                                should_reduce = False

                        if should_reduce:
                            if not t_reduces:
                                actions[terminal].append(new_reduce)
                            else:
                                # REDUCE/REDUCE conflicts
                                # Try to resolve using priorities. Only the
                                # first existing reduction is compared. If
                                # this production's priority is lower the
                                # new reduction is dropped.
                                if prod.prior == t_reduces[0].prod.prior:
                                    actions[terminal].append(new_reduce)
                                elif prod.prior > t_reduces[0].prod.prior:
                                    # If this production priority is higher
                                    # it should override all other reductions.
                                    actions[terminal][:] = \
                                        [x for x in actions[terminal]
                                         if x.action is not REDUCE]
                                    actions[terminal].append(new_reduce)

    table = LRTable(states)
    return table
Beispiel #11
0
def act_production_rule(context, nodes):
    """
    Grammar action for a production rule. Creates a Production for each
    alternative of the rule and, if named matches (assignments with a name)
    are used, a Python class registered in `context.classes` under the rule
    name.

    Returns:
        list: Productions created for the rule.

    Raises:
        GrammarError: If a class for the rule name is already registered.
    """
    name, _, rhs_prods, __ = nodes

    symbol = NonTerminal(name)

    # Collect all productions for this rule
    prods = []
    attrs = {}
    for assignments, disrules in rhs_prods:
        # Here we know the indexes of assignments
        for position, assignment in enumerate(assignments):
            if assignment.name:
                assignment.index = position

        production = Production(
            symbol,
            ProductionRHS(a.symbol for a in assignments),
            assignments=assignments,
            assoc=disrules.get('assoc', ASSOC_NONE),
            prior=disrules.get('priority', DEFAULT_PRIORITY),
            dynamic=disrules.get('dynamic', False))
        prods.append(production)

        # TODO: check/handle multiple assignments to the same attribute
        attrs.update(
            (a.name, PGAttribute(a.name, a.multiplicity, a.orig_symbol.name))
            for a in assignments if a.name)

    # Without named matches there is no class to create.
    if not attrs:
        return prods

    # Named matches are used: create Python class that will be used
    # for object instantiation.
    class ParglareMetaClass(type):
        def __repr__(cls):
            return '<parglare:{} class at {}>'.format(name, id(cls))

    @add_metaclass(ParglareMetaClass)
    class ParglareClass(object):
        """Dynamically created class. Each parglare rule that uses named
        matches by default uses this action that will create Python object
        of this class.

        Attributes:
            _pg_attrs(dict): A dict of meta-attributes keyed by name.
                Used by common rules.
            _pg_position(int): A position in the input string where
                this class is defined.
            _pg_position_end(int): A position in the input string where
                this class ends.

        """

        _pg_attrs = attrs

        def __init__(self, **kwargs):
            for attr_name, attr_value in kwargs.items():
                setattr(self, attr_name, attr_value)

        def __repr__(self):
            if not hasattr(self, 'name'):
                return "<parglare:{} instance at {}>"\
                    .format(name, hex(id(self)))
            return "<{}:{}>".format(name, self.name)

    ParglareClass.__name__ = str(name)
    if name in context.classes:
        raise GrammarError(
            'Multiple definition for Rule/Class "{}"'.format(name))
    context.classes[name] = ParglareClass
    symbol.action_name = 'obj'

    return prods