def do(sh):
    """Converts a uni-code string into a state machine that parses 
       its letters sequentially. Each state in the sequence correponds
       to the sucessful triggering of a letter. Only the last state, though,
       is an acceptance state. Any bailing out before is 'not accepted'. 
       Example:

       "hey" is translated into the state machine:

           (0)-- 'h' -->(1)-- 'e' -->(2)-- 'y' --> ACCEPTANCE
            |            |            |
           FAIL         FAIL         FAIL
    
      Note: The state indices are globally unique. But, they are not necessarily
            0, 1, 2, ... 
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    # resulting state machine
    result    = StateMachine()
    state_idx = result.init_state_index

    # Only \" is a special character '"', any other backslashed character
    # remains as the sequence 'backslash' + character
    for char_code in get_character_code_sequence(sh):
        state_idx = result.add_transition(state_idx, char_code)

    # when the last state has triggered, it is supposed to end up in 'acceptance'
    result.states[state_idx].set_acceptance()
    return result
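A minimal driving sketch for the function above; it assumes the quex modules the snippet relies on are importable and otherwise uses only the Python 2 stdlib:

from StringIO import StringIO  # Python 2 stdlib

sm = do(StringIO("hey"))
# 'sm' now holds a linear chain of three transitions; only the state reached
# after 'y' carries the acceptance flag, as sketched in the docstring above.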
Example #2
def do(sh):
    """Converts a uni-code string into a state machine that parses 
       its letters sequentially. Each state in the sequence correponds
       to the sucessful triggering of a letter. Only the last state, though,
       is an acceptance state. Any bailing out before is 'not accepted'. 
       Example:

       "hey" is translated into the state machine:

           (0)-- 'h' -->(1)-- 'e' -->(2)-- 'y' --> ACCEPTANCE
            |            |            |
           FAIL         FAIL         FAIL
    
      Note: The state indices are globally unique. But, they are not necessarily
            0, 1, 2, ... 
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    # resulting state machine
    result = StateMachine()
    state_idx = result.init_state_index

    # Only \" is a special character '"', any other backslashed character
    # remains as the sequence 'backslash' + character
    for char_code in get_character_code_sequence(sh):
        state_idx = result.add_transition(state_idx, char_code)

    # when the last state has triggered, it is supposed to end up in 'acceptance'
    result.states[state_idx].set_acceptance()
    return result
Example #3
def create_ALL_BUT_NEWLINE_state_machine():
    result = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the code
    #       generator.
    trigger_set = NumberSet(Interval(ord("\n")).inverse()) 

    result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True) 
    return result
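For intuition, the inverse of the one-codepoint interval around '\n' is simply "every code point except 10". A self-contained stand-in (deliberately not quex's NumberSet/Interval API) behaves the same for membership tests:

def all_but_newline(code, max_code=0x10FFFF):
    # stands in for NumberSet(Interval(ord("\n")).inverse())
    return 0 <= code <= max_code and code != ord("\n")

assert all_but_newline(ord("a"))
assert not all_but_newline(ord("\n"))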
Example #4
def create_ALL_BUT_NEWLINE_state_machine():
    result = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the code
    #       generator.
    trigger_set = NumberSet(Interval(ord("\n")).inverse())

    result.add_transition(result.init_state_index,
                          trigger_set,
                          AcceptanceF=True)
    return result
Example #5
def create_state_machine(SM, StateSetList):
    # If all states are of size one, this means, that there were no states that
    # could have been combined. In this case a simple copy of the original
    # state machine will do.
    if filter(lambda state_set: len(state_set) != 1,
              StateSetList.state_set_list) == []:
        return SM.clone()

    # Define a mapping from the state set to a new target state index
    map_new_state_index = {}
    for state_set_index in range(len(StateSetList.state_set_list)):
        map_new_state_index[state_set_index] = state_machine_index.get()

    # The state set that contains the initial state becomes the initial state of
    # the new state machine.
    state_set_containing_initial_state_i = StateSetList.map[
        SM.init_state_index]
    result = StateMachine(
        map_new_state_index[state_set_containing_initial_state_i],
        Core=SM.core())

    # Ensure that each target state index has a state inside the state machine
    for new_state_index in map_new_state_index.values():
        result.create_new_state(StateIdx=new_state_index)

    # Build up the state machine out of the remaining state sets
    state_set_idx = -1L
    for state_set in StateSetList.state_set_list:
        state_set_idx += 1L
        assert len(state_set) != 0, "State set of size '0'. List = " + repr(
            StateSetList)

        # The prototype: states in one set all behave equivalently with respect to
        # target state sets; thus only one state from the set has to be considered.
        prototype = SM.states[state_set[0]]
        # The representive: shall represent the state set in the new state machine.
        representive = result.states[map_new_state_index[state_set_idx]]

        # The representive must have all transitions that the prototype has
        for target_state_index, trigger_set in prototype.transitions().get_map(
        ).items():
            target_state_set_index = StateSetList.map[target_state_index]
            representive.add_transition(
                trigger_set, map_new_state_index[target_state_set_index])

        # Merge all core information of the states inside the state set.
        # If one state set contains an acceptance state, then the result is 'acceptance'.
        # (Note: The initial split separates acceptance states from those that are not
        #  acceptance states. There can be no state set containing acceptance and
        #  non-acceptance states)
        # (Note that the prototype's info has not been included yet; consider the whole set.)
        for state_idx in state_set:
            representive.merge(SM.states[state_idx])

    return result
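Independently of quex's classes, the merging above is the construction of a quotient automaton: one new state per state set, with transitions copied from any prototype member (all members behave identically by assumption). A self-contained sketch of that idea over plain dictionaries:

def quotient_dfa(delta, partition):
    # delta: {state: {symbol: target}}; partition: list of state lists.
    # Every block member behaves identically, so block[0] serves as the
    # prototype, just like 'prototype' in create_state_machine() above.
    block_of = {}
    for i, block in enumerate(partition):
        for s in block:
            block_of[s] = i
    new_delta = {}
    for i, block in enumerate(partition):
        new_delta[i] = dict((sym, block_of[t])
                            for sym, t in delta[block[0]].items())
    return new_delta

# Example: states 1 and 2 are equivalent and collapse into one block.
delta = {0: {'a': 1, 'b': 2}, 1: {'a': 1}, 2: {'a': 2}}
print quotient_dfa(delta, [[0], [1, 2]])  # {0: {'a': 1, 'b': 1}, 1: {'a': 1}}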
Example #6
def snap_non_control_character(stream, PatternDict):
    __debug_entry("non-control characters", stream)

    # (*) read first character
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    if char_code == 0xFF:
        error_msg(
            "Character could not be interpreted as UTF8 code or End of File reached prematurely.",
            stream)
    result = StateMachine()
    result.add_transition(result.init_state_index, char_code, AcceptanceF=True)
    return __debug_exit(result, stream)
Example #7
def create_state_machine(SM, StateSetList):
    # If all states are of size one, this means, that there were no states that
    # could have been combined. In this case a simple copy of the original
    # state machine will do.
    if filter(lambda state_set: len(state_set) != 1, StateSetList.state_set_list) == []:
        return SM.clone()
    
    # Define a mapping from the state set to a new target state index
    map_new_state_index = {}
    for state_set_index in range(len(StateSetList.state_set_list)):
        map_new_state_index[state_set_index] = state_machine_index.get()
                
    # The state set that contains the initial state becomes the initial state of 
    # the new state machine.   
    state_set_containing_initial_state_i = StateSetList.map[SM.init_state_index]
    result = StateMachine(map_new_state_index[state_set_containing_initial_state_i],
                          Core = SM.core())

    # Ensure that each target state index has a state inside the state machine
    for new_state_index in map_new_state_index.values():
        result.create_new_state(StateIdx=new_state_index)

    # Build up the state machine out of the remaining state sets
    state_set_idx = -1L
    for state_set in StateSetList.state_set_list:
        state_set_idx += 1L
        assert len(state_set) != 0, "State set of size '0'. List = " + repr(StateSetList)

        # The prototype: states in one set all behave equivalently with respect to
        # target state sets; thus only one state from the set has to be considered.
        prototype    = SM.states[state_set[0]]
        # The representive: shall represent the state set in the new state machine.
        representive = result.states[map_new_state_index[state_set_idx]]

        # The representive must have all transitions that the prototype has
        for target_state_index, trigger_set in prototype.transitions().get_map().items():
            target_state_set_index = StateSetList.map[target_state_index]
            representive.add_transition(trigger_set, 
                                        map_new_state_index[target_state_set_index])

        # Merge all core information of the states inside the state set.
        # If one state set contains an acceptance state, then the result is 'acceptance'.
        # (Note: The initial split separates acceptance states from those that are not
        #  acceptance states. There can be no state set containing acceptance and 
        #  non-acceptance states) 
        # (Note that the prototype's info has not been included yet; consider the whole set.)
        for state_idx in state_set:
            representive.merge(SM.states[state_idx])

    return result    
Example #8
def create_ALL_BUT_NEWLINE_state_machine():
    global Setup
    result = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the code
    #       generator.
    trigger_set = NumberSet(Interval(ord("\n")).inverse())

    if Setup.get_character_value_limit() != sys.maxint:
        trigger_set.intersect_with(
            Interval(0, Setup.get_character_value_limit()))

    result.add_transition(result.init_state_index,
                          trigger_set,
                          AcceptanceF=True)
    return result
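Here the trigger set is additionally clamped to the configured character range. Assuming quex's Interval(0, limit) is half-open at the end (conventional for such libraries, but an assumption here), a plain-list illustration of that intersection:

def clamp_to_limit(codes, limit):
    # mirrors trigger_set.intersect_with(Interval(0, limit)), assumed [0, limit)
    return [c for c in codes if 0 <= c < limit]

assert clamp_to_limit([5, 255, 0x110000], 0x10000) == [5, 255]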
Example #9
def do(stream, PatternDict):
    trigger_set = snap_set_expression(stream, PatternDict)

    if trigger_set == None: 
        raise RegularExpressionException("Regular Expression: character_set_expression called for something\n" + \
                                         "that does not start with '[:', '[' or '\\P'")
    if trigger_set.is_empty():
        raise RegularExpressionException("Regular Expression: Character set expression results in empty set.")

    # Create state machine that triggers with the trigger set to SUCCESS
    # NOTE: The default for the ELSE transition is FAIL.
    sm = StateMachine()
    sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)

    return __debug_exit(sm, stream)
Example #10
def do(stream):
    trigger_set = snap_set_expression(stream)

    if trigger_set == None:
        raise RegularExpressionException("Regular Expression: character_set_expression called for something\n" + \
                                         "that does not start with '[:', '[' or '\\P'")
    if trigger_set.is_empty():
        raise RegularExpressionException(
            "Regular Expression: Character set expression results in empty set."
        )

    # Create state machine that triggers with the trigger set to SUCCESS
    # NOTE: The default for the ELSE transition is FAIL.
    sm = StateMachine()
    sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)

    return __debug_exit(sm, stream)
Example #11
def __set_end_of_line_post_context(sm, EndOfFileCode=0):
    """Appends a post condition to the state machine to handle the end of line
       statement. This consists in translating 'EndOfLine' into a state machine
       with 'Newline' or 'EndOfFile'. Thus, when one of both follows the current
       character is at the end of a line.

       If you want to use a different code for end of file, specify it via the
       first argument 'EndOfFile'.

       NOTE: This is fundamentally different from beginning of line (BOL). BOL
             can be achieved by letting the state machine raise the corresponding
             flag. End of line post conditions rely on external algorithms for
             mounting a post-condition.
    """
    post_context_sm = StateMachine()
    post_context_sm.add_transition(post_context_sm.init_state_index, ord('\n'), AcceptanceF=True)
    post_context_sm.add_transition(post_context_sm.init_state_index, EndOfFileCode, AcceptanceF=True)

    result = setup_post_context.do(sm, post_context_sm)

    return result
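The mounted post-condition machine accepts exactly one character. As a plain-Python restatement of its two transitions (illustration only):

def at_end_of_line(next_code, EndOfFileCode=0):
    # the current position is at end of line iff '\n' or the EOF code follows
    return next_code == ord('\n') or next_code == EndOfFileCode

assert at_end_of_line(ord('\n')) and at_end_of_line(0)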
Example #12
def __set_end_of_line_post_context(sm, EndOfFileCode=0):
    """Appends a post condition to the state machine to handle the end of line
       statement. This consists in translating 'EndOfLine' into a state machine
       with 'Newline' or 'EndOfFile'. Thus, when one of both follows the current
       character is at the end of a line.

       If you want to use a different code for end of file, specify it via the
       first argument 'EndOfFile'.

       NOTE: This is fundamentally different from beginning of line (BOL). BOL
             can be achieved by letting the state machine raise the corresponding
             flag. End of line post conditions rely on external algorithms for
             mounting a post-condition.
    """
    post_context_sm = StateMachine()
    post_context_sm.add_transition(post_context_sm.init_state_index,
                                   ord('\n'),
                                   AcceptanceF=True)
    post_context_sm.add_transition(post_context_sm.init_state_index,
                                   EndOfFileCode,
                                   AcceptanceF=True)

    result = setup_post_context.do(sm, post_context_sm)

    return result
Example #13
    def seal(self):
        if len(self.space_db) == 0 and len(self.grid_db) == 0:
            default_space = ord(' ')
            default_tab = ord('\t')
            bad = self.bad_character_set
            if bad.get().contains(default_space) == False:
                self.specify_space("[ ]", NumberSet(default_space), 1, self.fh)
            if bad.get().contains(default_tab) == False:
                self.specify_grid("[\\t]", NumberSet(default_tab), 4, self.fh)

            if len(self.space_db) == 0 and len(self.grid_db) == 0:
                error_msg(
                    "No space or grid defined for indentation counting. Default\n"
                    "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                    bad.file_name, bad.line_n)

        if self.newline_state_machine.get() == None:
            sm = StateMachine()
            end_idx = sm.add_transition(sm.init_state_index,
                                        NumberSet(ord('\n')),
                                        AcceptanceF=True)
            mid_idx = sm.add_transition(sm.init_state_index,
                                        NumberSet(ord('\r')),
                                        AcceptanceF=False)
            sm.add_transition(mid_idx,
                              NumberSet(ord('\n')),
                              end_idx,
                              AcceptanceF=False)
            self.specify_newline("(\\r\\n)|(\\n)", sm, self.fh)
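The newline machine assembled above has three states. Written as an explicit transition table (an illustrative re-encoding, not quex code), it accepts '\n' as well as '\r\n':

# states: 0 = init, 1 = after '\r' (mid_idx), 2 = accept (end_idx)
NEWLINE_DELTA = {(0, '\n'): 2, (0, '\r'): 1, (1, '\n'): 2}

def matches_newline(s):
    state = 0
    for ch in s:
        state = NEWLINE_DELTA.get((state, ch), -1)
        if state == -1:
            return False
    return state == 2

assert matches_newline('\n') and matches_newline('\r\n')
assert not matches_newline('\r')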
Example #14
    def seal(self):
        if len(self.space_db) == 0 and len(self.grid_db) == 0:
            default_space = ord(" ")
            default_tab = ord("\t")
            bad = self.bad_character_set
            if bad.get().contains(default_space) == False:
                self.specify_space("[ ]", NumberSet(default_space), 1, self.fh)
            if bad.get().contains(default_tab) == False:
                self.specify_grid("[\\t]", NumberSet(default_tab), 4, self.fh)

            if len(self.space_db) == 0 and len(self.grid_db) == 0:
                error_msg(
                    "No space or grid defined for indentation counting. Default\n"
                    "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                    bad.file_name,
                    bad.line_n,
                )

        if self.newline_state_machine.get() == None:
            sm = StateMachine()
            end_idx = sm.add_transition(sm.init_state_index, NumberSet(ord("\n")), AcceptanceF=True)
            mid_idx = sm.add_transition(sm.init_state_index, NumberSet(ord("\r")), AcceptanceF=False)
            sm.add_transition(mid_idx, NumberSet(ord("\n")), end_idx, AcceptanceF=False)
            self.specify_newline("(\\r\\n)|(\\n)", sm, self.fh)
Example #15
def do(sm, BeginOfLineF, EndOfLineF, DOS_CarriageReturnNewlineF=False):
    """DOS_CarriageReturnNewlineF == True:  
               '$' is implemented as post-condition '\r\n'. This is required
               or lexical analysers on DOS and Windows machines.
       DOS_CarriageReturnNewlineF == False:
               '$' is implemented as post-condition '\n' -- like the normal
               newline on Unix machines.
    """
    # (1) end of line
    #     NOTE: This must come before 'Begin of Line', because a post condition
    #           is added that introduces new acceptance states.
    if EndOfLineF:
        if sm.core().post_context_id() == -1L:
            # -- create a state machine that represents the post-condition
            # -- mount it to the core pattern as a post-condition
            post_sm = StateMachine()
            if not DOS_CarriageReturnNewlineF:
                state_idx = post_sm.add_transition(post_sm.init_state_index,
                                                   ord('\n'),
                                                   AcceptanceF=True)
            else:
                aux_idx = post_sm.add_transition(post_sm.init_state_index,
                                                 ord('\r'),
                                                 AcceptanceF=False)
                state_idx = post_sm.add_transition(aux_idx,
                                                   ord('\n'),
                                                   AcceptanceF=True)
            ## post_sm.add_transition(post_sm.init_state_index, EndOfFile_Code, state_idx, AcceptanceF=True)

            # post conditions add an epsilon transition that has to be solved
            # by translating state machine into a DFA
            sm = setup_post_context.do(sm, post_sm)
            sm = nfa_to_dfa.do(sm)
            assert sm.has_origins() == False

        else:
            post_context_id = sm.core().post_context_id()
            # end of line in two cases:
            #  (1) next char is '\n' (or \r\n in case of DOS_CarriageReturnNewlineF==True)
            #  (2) at end of file, we assume anyway that in this case the buffer needs to
            #      end with 'EndOfFile_Code' just before the first letter.
            #
            #  => mount 'newline or EndOfFile_Code' to the tail of pattern
            #
            new_state_idx = __add_line_border_at_end(
                sm, DOS_CarriageReturnNewlineF, InverseF=False)
            # -- the post-context flag needs to be raised
            sm.states[new_state_idx].core().set_post_context_id(
                post_context_id)

    # (2) begin of line
    if BeginOfLineF:
        # begin of line in two cases:
        #  (1) last char was '\n'
        #  (2) the first character is not detected as begin of line, if the
        #      pre-condition is non-trivial.
        #
        #  A line always begins after '\n', so no check for '\r\n' is necessary.
        #  => DOS_CarriageReturnNewlineF = False
        if sm.core().pre_context_sm() != None:
            __add_line_border_at_end(sm.core().pre_context_sm(),
                                     DOS_CarriageReturnNewlineF=False,
                                     InverseF=True)
        else:
            # mark all acceptance states with the 'trivial pre-condition BOL' flag
            for state in sm.get_acceptance_state_list():
                state.core().set_pre_context_begin_of_line_f()
            sm.core().set_pre_context_begin_of_line_f()

    return sm
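Since '$' is mounted as a post-condition, it constrains what may follow the lexeme without making it part of the match; that behaves much like a lookahead. Python's re module illustrates the Unix vs. DOS variants (illustration only, not quex's mechanism):

import re

unix_eol = re.compile(r"hey(?=\n)")    # DOS_CarriageReturnNewlineF == False
dos_eol  = re.compile(r"hey(?=\r\n)")  # DOS_CarriageReturnNewlineF == True

assert unix_eol.match("hey\nmore")
assert dos_eol.match("hey\r\nmore")
assert not dos_eol.match("hey\nmore")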
Example #16
def do(StateMachineList, CommonTerminalStateF=True, CloneF=True):
    """Connect state machines paralell.

       CommonTerminalStateF tells whether the state machines shall trigger
                            to a common terminal. This is necessary if the
                            state machines are part of a bigger construction.

                            When the ready-to-rumble pattern state machines
                            are to be combined into a single analyzer, the
                            flag must be set to 'False'.

       CloneF               Controls whether the state machine list is cloned.
                            If the single state machines are no longer required after
                            construction, CloneF can be set to False.

                            If cloning is disabled, the state machines themselves
                            are altered, which brings some speed advantage.
    """
    assert type(StateMachineList) == list
    assert len(StateMachineList) != 0
    assert map(lambda x: x.__class__.__name__,
               StateMachineList) == ["StateMachine"] * len(StateMachineList)

    # filter out empty state machines from the consideration
    state_machine_list = filter(lambda sm: not sm.is_empty(), StateMachineList)
    empty_state_machine_occured_f = len(state_machine_list) != len(
        StateMachineList)

    if len(state_machine_list) < 2:
        if len(state_machine_list) < 1: result = StateMachine()
        else: result = state_machine_list[0]
        if empty_state_machine_occured_f:
            result = __add_free_pass(result)
        return result

    # (*) need to clone the state machines, i.e. provide their internal
    #     states with new ids, but the 'behavior' remains. This allows
    #     state machines to appear twice, or being used in 'larger'
    #     conglomerates.
    if CloneF:
        clone_list = map(lambda sm: sm.clone(), state_machine_list)
    else:
        clone_list = state_machine_list

    # (*) collect all transitions from both state machines into a single one
    #     (clone to ensure unique identifiers of states)
    result = StateMachine()
    for clone in clone_list:
        result.states.update(clone.states)

    # (*) add additional **init** and **end** state
    #     NOTE: when the result state machine was created, it already contains a
    #           new initial state index. thus at this point only the new terminal
    #           state has to be created.
    #     NOTE: it is essential that the acceptance flag stays False, at this
    #           point in time, so that the mounting operations only happen on
    #           the old acceptance states. Later the acceptance state is raised
    #           to 'accepted' (see below)
    new_terminal_state_index = -1L
    if CommonTerminalStateF:
        new_terminal_state_index = result.create_new_state()

    # (*) Connect from the new initial state to the initial states of the
    #     clones via epsilon transition.
    #     Connect from each success state of the clones to the new end state
    #     via epsilon transition.
    for clone in clone_list:
        result.mount_to_initial_state(clone.init_state_index)
        if CommonTerminalStateF:
            result.mount_to_acceptance_states(new_terminal_state_index,
                                              CancelStartAcceptanceStateF=True,
                                              RaiseTargetAcceptanceStateF=True,
                                              LeaveStoreInputPositionsF=True)

    # (*) If there was an empty state machine, a 'free pass' is added
    if empty_state_machine_occured_f:
        result = __add_free_pass(result, new_terminal_state_index)

    return result
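The wiring above is the classic union construction: a fresh initial state with epsilon edges into each clone, and optionally epsilon edges from each old acceptance state into one common terminal. A generic sketch over an assumed (init, accept_states, edge_list) NFA shape, where a None label means epsilon:

def connect_parallel(nfa_list):
    new_init, new_term = "init*", "term*"
    edges = []
    for init, accepts, nfa_edges in nfa_list:
        edges.extend(nfa_edges)                # state names assumed unique
        edges.append((new_init, None, init))   # epsilon into each clone
        for a in accepts:
            edges.append((a, None, new_term))  # epsilon to common terminal
    return new_init, set([new_term]), edges

# Cloning beforehand (as with CloneF above) is what guarantees uniqueness
# of the state names across the combined machine.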
Example #17
def do(sm, BeginOfLineF, EndOfLineF, DOS_CarriageReturnNewlineF=False):
    """DOS_CarriageReturnNewlineF == True:  
               '$' is implemented as post-condition '\r\n'. This is required
               or lexical analysers on DOS and Windows machines.
       DOS_CarriageReturnNewlineF == False:
               '$' is implemented as post-condition '\n' -- like the normal
               newline on Unix machines.
    """
    # (1) end of line
    #     NOTE: This must come before 'Begin of Line', because a post condition
    #           is added that introduces new acceptance states.
    if EndOfLineF:
        if sm.core().post_context_id() == -1L:
            # -- create a state machine that represents the post-condition
            # -- mount it to the core pattern as a post-condition
            post_sm = StateMachine()
            if not DOS_CarriageReturnNewlineF:
                state_idx = post_sm.add_transition(post_sm.init_state_index, ord('\n'), AcceptanceF=True)
            else:
                aux_idx   = post_sm.add_transition(post_sm.init_state_index, ord('\r'), AcceptanceF=False)
                state_idx = post_sm.add_transition(aux_idx, ord('\n'), AcceptanceF=True)
            ## post_sm.add_transition(post_sm.init_state_index, EndOfFile_Code, state_idx, AcceptanceF=True)
            
            # post conditions add an epsilon transition that has to be solved 
            # by translating state machine into a DFA
            sm = setup_post_context.do(sm, post_sm) 
            sm = nfa_to_dfa.do(sm)
            assert sm.has_origins() == False
            
        else:
            post_context_id = sm.core().post_context_id()
            # end of line in two cases:
            #  (1) next char is '\n' (or \r\n in case of DOS_CarriageReturnNewlineF==True)
            #  (2) at end of file, we assume anyway that in this case the buffer needs to
            #      end with 'EndOfFile_Code' just before the first letter.
            #
            #  => mount 'newline or EndOfFile_Code' to the tail of pattern
            #
            new_state_idx = __add_line_border_at_end(sm, 
                                                     DOS_CarriageReturnNewlineF, InverseF=False)
            # -- the post-context flag needs to be raised
            sm.states[new_state_idx].core().set_post_context_id(post_context_id)

    # (2) begin of line
    if BeginOfLineF: 
        # begin of line in two cases:
        #  (1) last char was '\n'
        #  (2) the first character is not detected as begin of line, if the 
        #      pre-condition is non-trivial.
        #
        #  A line always begins after '\n', so no check for '\r\n' is necessary.
        #  => DOS_CarriageReturnNewlineF = False
        if sm.core().pre_context_sm() != None:
            __add_line_border_at_end(sm.core().pre_context_sm(), 
                                     DOS_CarriageReturnNewlineF=False, InverseF=True)
        else:
            # mark all acceptance states with the 'trivial pre-condition BOL' flag
            for state in sm.get_acceptance_state_list():
                state.core().set_pre_context_begin_of_line_f()
            sm.core().set_pre_context_begin_of_line_f()
                
    return sm
Example #18
def do(SM):
    """Creates a deterministic finite automaton (DFA) from the current state 
       machine - which may be a NFA (non-deterministic finite automaton). This is
       a generlized version of the 'subset construction' algorithm. Where 
       subsection construction focusses on letters of an alphabet for the
       investigation of transitions, this algorithm focusses on elementary
       trigger sets. A very good description of the subset construction 
       algorithm can be found in 'Engineering a Compiler' by Keith Cooper.
    """
    # (*) create the result state machine
    initial_state_epsilon_closure = SM.get_epsilon_closure(SM.init_state_index)

    # NOTE: Later on, state machines with an initial acceptance state are forbidden.
    #       So, acceptance is not a question here. Think about setting it to false anyway.
    result = StateMachine(Core=SM.core())

    # (*) initial state of resulting DFA = epsilon closure of initial state of NFA
    #     -- add the origin list of all states in the epsilon closure
    new_init_state = result.get_init_state()
    for state in map(lambda idx: SM.states[idx],
                     initial_state_epsilon_closure):
        new_init_state.merge(state)

    # (*) prepare the initial worklist
    worklist = [(result.init_state_index, initial_state_epsilon_closure)]

    epsilon_closure_db = SM.get_epsilon_closure_db()

    while worklist != []:
        # 'start_state_index' is the index of an **existing** state in the state machine.
        # It was either created above, in StateMachine's constructor, or as a target
        # state index.
        start_state_index, start_state_combination = worklist.pop()

        # (*) compute the elementary trigger sets together with the
        #     epsilon closure of target state combinations that they trigger to.
        #     In other words: find the ranges of characters where the state triggers to
        #     a unique state combination. E.g:
        #                Range        Target State Combination
        #                [0:23]   --> [ State1, State2, State10 ]
        #                [24:60]  --> [ State1 ]
        #                [61:123] --> [ State2, State10 ]
        #
        elementary_trigger_set_infos = SM.get_elementary_trigger_sets(
            start_state_combination, epsilon_closure_db)
        ## DEBUG_print(start_state_combination, elementary_trigger_set_infos)

        # (*) loop over all elementary trigger sets
        for epsilon_closure_of_target_state_combination, trigger_set in elementary_trigger_set_infos:
            #  -- if there is no trigger to the given target state combination, then drop it
            if trigger_set.is_empty(): continue

            # -- add a new target state representing the state combination
            #    (if this did not happen yet)
            target_state_index = \
                 map_state_combination_to_index(epsilon_closure_of_target_state_combination)

            # -- if target state combination was not considered yet, then create
            #    a new state in the state machine
            if result.states.has_key(target_state_index):
                # -- add only a transition 'start state to target state'
                result.add_transition(start_state_index, trigger_set,
                                      target_state_index)
            else:
                # -- add the transition 'start state to target state'
                #    (create implicitly the new target state in the state machine)
                result.add_transition(start_state_index, trigger_set,
                                      target_state_index)
                # -- merge the information of the combined states inside the target state
                new_target_state = result.states[target_state_index]
                for state in map(lambda idx: SM.states[idx],
                                 epsilon_closure_of_target_state_combination):
                    new_target_state.merge(state)

                worklist.append((target_state_index,
                                 epsilon_closure_of_target_state_combination))

    return result
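Stripped of quex's elementary trigger sets, the worklist loop above is the textbook subset construction over single symbols. A compact self-contained version for comparison, assuming epsilon edges have already been closed over (as get_epsilon_closure() does above):

def subset_construction(nfa_delta, nfa_init, nfa_accepts):
    # nfa_delta: {state: {symbol: set_of_target_states}}; nfa_accepts: set
    init = frozenset([nfa_init])
    dfa_delta, worklist, seen = {}, [init], set([init])
    while worklist:
        combo = worklist.pop()
        targets = {}
        for s in combo:
            for sym, dsts in nfa_delta.get(s, {}).items():
                targets.setdefault(sym, set()).update(dsts)
        dfa_delta[combo] = {}
        for sym, dsts in targets.items():
            t = frozenset(dsts)
            dfa_delta[combo][sym] = t
            if t not in seen:
                seen.add(t)
                worklist.append(t)
    accepting = [c for c in dfa_delta if c & nfa_accepts]
    return init, dfa_delta, accepting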
Example #19
def do(the_state_machines):
    """Connect state machines paralell."""
    assert type(the_state_machines) == list
    assert len(the_state_machines) != 0
    assert map(
        lambda x: x.__class__.__name__,
        the_state_machines) == ["StateMachine"] * len(the_state_machines)

    # filter out empty state machines from the consideration
    state_machines = filter(lambda sm: not sm.is_empty(), the_state_machines)

    def __add_optional_free_pass(result_state_machine, TerminationStateIdx=-1):
        """Add an optional 'free pass' if there was an empty state."""
        # If there was an empty state machine, the number of elements in the list
        # changed; in that case one has to add a 'free pass' from the initial state to
        # the final acceptance state.
        if TerminationStateIdx == -1:
            acceptance_state_index_list = result_state_machine.get_acceptance_state_index_list(
            )
            assert acceptance_state_index_list != [], \
                   "resulting state machine has no acceptance state!"
            TerminationStateIdx = acceptance_state_index_list[0]

        if len(state_machines) != len(the_state_machines):
            result_state_machine.add_epsilon_transition(
                result_state_machine.init_state_index, TerminationStateIdx)
        return result_state_machine

    if len(state_machines) < 2:
        if len(state_machines) < 1:
            return __add_optional_free_pass(StateMachine())
        else:
            return __add_optional_free_pass(state_machines[0])

    # (*) need to clone the state machines, i.e. provide their internal
    #     states with new ids, but the 'behavior' remains. This allows
    #     state machines to appear twice, or being used in 'larger'
    #     conglomerates.
    clone_list = map(lambda sm: sm.clone(), state_machines)

    # (*) collect all transitions from both state machines into a single one
    #     (clone to ensure unique identifiers of states)
    result = StateMachine()
    for clone in clone_list:
        for start_state_index, states in clone.states.items():
            # DOUBT: is deepcopy necessary at this place?
            # ANSWER: it does not harm, because no new state indices are created
            result.states[start_state_index] = deepcopy(states)

    # (*) add additional **init** and **end** state
    #     NOTE: when the result state machine was created, it already contains a
    #           new initial state index. thus at this point only the new terminal
    #           state has to be created.
    #     NOTE: it is essential that the acceptance flag stays False, at this
    #           point in time, so that the mounting operations only happen on
    #           the old acceptance states. Later the acceptance state is raised
    #           to 'accepted' (see below)
    new_terminal_state_index = result.create_new_state()

    # (*) connect from the new initial state to the initial states of the
    #     clones via epsilon transition.
    #     connect from each success state of the clones to the new end state
    #     via epsilon transition.
    for clone in clone_list:
        result.mount_to_initial_state(clone.init_state_index)
        result.mount_to_acceptance_states(new_terminal_state_index,
                                          CancelStartAcceptanceStateF=True,
                                          RaiseTargetAcceptanceStateF=True,
                                          LeaveStoreInputPositionsF=True)

    return __add_optional_free_pass(result, new_terminal_state_index)
Example #20
def parse_mode_option(fh, new_mode):
    LanguageDB = Setup.language_db

    def fit_state_machine(SM):
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else: result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return result

    identifier = read_option_start(fh)
    if identifier == None: return False

    verify_word_in_list(identifier, lexer_mode.mode_option_info_db.keys(),
                        "mode option", fh.name,
                        get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(
            fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier,
                      fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper." % identifier, fh)

        # TriggerSet skipping is implemented the following way: as soon as one element of the
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # The skipper is entered as if the opener pattern were a normal pattern whose action
        # is the 'skipper'.
        # NOTE: The corresponding CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index,
                                  trigger_set,
                                  AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        pattern_sm = fit_state_machine(pattern_sm)
        # For skippers line and column counting detection is not really a topic
        # It is done in the skipper itself.
        pattern_sm.side_info = SideInfo()

        new_mode.add_match(pattern_str, action, pattern_sm)

        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full-fledged regular expression as opener,
        # since it only affects the trigger. Not so the nested range skipper; see below.

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings' not state machines
            opener_str, opener_sequence = parse_string_constant(
                fh, "Opener pattern for 'skip_nested_range'")

            opener_sm = StateMachine()
            idx = opener_sm.init_state_index
            for letter in opener_sequence:
                idx = opener_sm.add_transition(idx, letter)
            opener_sm.states[idx].set_acceptance(True)
        else:
            opener_str, opener_sm = regular_expression.parse(fh)
            # For 'range skipping' the opener sequence is not needed, only the opener state
            # machine is webbed into the pattern matching state machine.
            opener_sequence = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = parse_string_constant(
            fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier,
                      fh)

        # Skipper code is to be generated later
        generator_function = {
            "skip_range": skip_range.do,
            "skip_nested_range": skip_nested_range.do,
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"] = new_mode.name

        fit_state_machine(opener_sm)

        # For skippers line and column counting detection is not really a topic
        # It is done in the skipper itself.
        opener_sm.side_info = SideInfo()

        new_mode.add_match(opener_str, action, opener_sm)

        return True

    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern = ""
        if value.newline_suppressor_state_machine.get() != None:
            suppressed_newline_pattern = \
                  "(" + value.newline_suppressor_state_machine.pattern_str + ")" \
                + "(" + value.newline_state_machine.pattern_str + ")"

            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])

            FileName = value.newline_suppressor_state_machine.file_name
            LineN = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code_fragment = UserCodeFragment(
                "goto %s;" % get_label("$start", U=True), FileName, LineN)

            suppressed_newline_sm = fit_state_machine(suppressed_newline_sm)

            # Analyze pattern for constant number of newlines, characters, etc.
            suppressed_newline_sm.side_info = SideInfo(
                character_counter.get_newline_n(suppressed_newline_sm),
                character_counter.get_character_n(suppressed_newline_sm))

            new_mode.add_match(suppressed_newline_pattern,
                               code_fragment,
                               suppressed_newline_sm,
                               Comment="indentation newline suppressor")

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #      Let               newline
        #      be defined as:    newline ([space]* newline)*
        #
        # This way empty lines are eaten away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index,
                          value.indentation_count_character_set(),
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = hopcroft.do(nfa_to_dfa.do(x4), CreateNewStateMachineF=False)

        FileName = value.newline_state_machine.file_name
        LineN = value.newline_state_machine.line_n
        action = GeneratedCode(indentation_counter.do, FileName, LineN)

        action.data["indentation_setup"] = value

        sm = fit_state_machine(sm)
        sm.side_info = SideInfo(character_counter.get_newline_n(sm),
                                character_counter.get_character_n(sm))
        new_mode.add_match(value.newline_state_machine.pattern_str,
                           action,
                           sm,
                           Comment="indentation newline")

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert lexer_mode.mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = lexer_mode.mode_option_info_db[identifier]
    if option_info.domain != None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (Value, Option) + \
                  "Though, possible for this option are only: %s." % repr(oi.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True
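The "newline ([space]* newline)*" trick above can be checked with an ordinary regular expression. Here with Python's re, assuming the indentation count character set is ' ' and '\t' (it is really whatever indentation_count_character_set() returns):

import re

newline = re.compile(r"\n(?:[ \t]*\n)*")
m = newline.match("\n   \n\t\n    x")
print repr(m.group())  # '\n   \n\t\n' -- the empty lines are consumed in one match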
Example #21
def snap_non_control_characters(stream):
    """Snaps any 'non_control_character' using UTF8 encoding from the given string. Note, that 
       in UTF8 a character may consist of more than one byte. Creates a state machine 
       that contains solely one trigger for each character to a acceptance state.

       This function **concatinates** incoming characters, but **repetition** has preceedence
       over concatination, so it checks after each character whether it is followed by
       a repetition ('*', '+', '?', '{..}'). In such a case, the repetition of the character
       is appended.
    """
    __debug_entry("non-control characters", stream)

    result      = StateMachine()
    state_index = result.init_state_index
    # (*) read first character
    position  = stream.tell()
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    while char_code != 0xFF:
        # (1) check against occurrence of control characters
        #     this needs to come **before** the backslashed character interpretation.
        #     NOTE: A backslashed character can be a whitespace (for example '\n'). 
        #     (check against 0xFF to avoid overflow in function 'chr()') 
        if char_code < 0xFF \
           and (chr(char_code) in CONTROL_CHARACTERS or chr(char_code).isspace()):
               stream.seek(-1, 1) 
               break 

        # (2) treat backslashed characters
        if char_code == ord('\\'):
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set == None:
                stream.seek(1, 1)  # snap_property_set() leaves the stream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code == None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
                trigger_set = char_code
        else:
            trigger_set = char_code

        # (3) read next character
        position       = stream.tell()
        next_char_code = utf8.__read_one_utf8_code_from_stream(stream)
        #    -- check for repetition (repetition has precedence over concatenation)
        if next_char_code in [ord("+"), ord("*"), ord("?"), ord("{")]:
            # (*) create a state machine that consists of a single transition
            tmp = StateMachine()
            tmp.add_transition(tmp.init_state_index, trigger_set, AcceptanceF=True)
            # -- repeat the single character state machine
            stream.seek(position)
            tmp_repeated = __snap_repetition_range(tmp, stream) 
            # -- append it to the result (last state must be set to acceptance for concatenation)
            result.states[state_index].set_acceptance()
            result = sequentialize.do([result, tmp_repeated], MountToFirstStateMachineF=True)
            # As soon as there is repetition there might be more than one acceptance
            # state, and thus simple concatenation via 'add_transition' fails.
            # Let us return and treat the remaining characters
            # at the next call to this function.
            return __debug_exit(result, stream)

        else:
            # (*) add new transition from current state to a new state triggering
            #     on the given character.
            state_index = result.add_transition(state_index, trigger_set)

        char_code = next_char_code

    # last character in the chain triggers an 'acceptance state'
    result.states[state_index].set_acceptance()
        
    return __debug_exit(result, stream)
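The precedence rule implemented here, repetition binding tighter than concatenation, is the usual regex convention; Python's re shows the same behavior:

import re

assert re.match(r"ab*", "abbb").group() == "abbb"  # 'ab*' reads as 'a(b*)'
assert re.match(r"ab*", "a").group() == "a"        # ... not as '(ab)*'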
Example #22
def do(StateMachineList, CommonTerminalStateF=True, CloneF=True):
    """Connect state machines paralell.

       CommonTerminalStateF tells whether the state machines shall trigger
                            to a common terminal. This is necessary if the
                            state machines are part of a bigger construction.

                            When the ready-to-rumble pattern state machines
                            are to be combined into a single analyzer, the
                            flag must be set to 'False'.

       CloneF               Controls whether the state machine list is cloned.
                            If the single state machines are no longer required after
                            construction, CloneF can be set to False.

                            If cloning is disabled, the state machines themselves
                            are altered, which brings some speed advantage.
    """
    assert type(StateMachineList) == list
    assert len(StateMachineList) != 0
    assert map(lambda x: x.__class__.__name__, StateMachineList) == ["StateMachine"] * len(StateMachineList)
              
    # filter out empty state machines from the consideration          
    state_machine_list = filter(lambda sm: not sm.is_empty(), StateMachineList)
    empty_state_machine_occured_f = len(state_machine_list) != len(StateMachineList)

    if len(state_machine_list) < 2:
        if len(state_machine_list) < 1: result = StateMachine()
        else:                           result = state_machine_list[0]
        if empty_state_machine_occured_f:
            result = __add_free_pass(result)
        return result

    # (*) need to clone the state machines, i.e. provide their internal
    #     states with new ids, but the 'behavior' remains. This allows
    #     state machines to appear twice, or being used in 'larger'
    #     conglomerates.
    if CloneF:
        clone_list = map(lambda sm: sm.clone(), state_machine_list)
    else:
        clone_list = state_machine_list

    # (*) collect all transitions from both state machines into a single one
    #     (clone to ensure unique identifiers of states)
    result = StateMachine()
    for clone in clone_list:
        result.states.update(clone.states)

    # (*) add additional **init** and **end** state
    #     NOTE: when the result state machine was created, it already contains a 
    #           new initial state index. thus at this point only the new terminal
    #           state has to be created. 
    #     NOTE: it is essential that the acceptance flag stays False, at this
    #           point in time, so that the mounting operations only happen on
    #           the old acceptance states. Later the acceptance state is raised
    #           to 'accepted' (see below)
    new_terminal_state_index = -1L
    if CommonTerminalStateF:
        new_terminal_state_index = result.create_new_state() 
    
    # (*) Connect from the new initial state to the initial states of the
    #     clones via epsilon transition. 
    #     Connect from each success state of the clones to the new end state
    #     via epsilon transition.
    for clone in clone_list:
        result.mount_to_initial_state(clone.init_state_index)
        if CommonTerminalStateF:
            result.mount_to_acceptance_states(new_terminal_state_index,
                                              CancelStartAcceptanceStateF=True,
                                              RaiseTargetAcceptanceStateF=True,
                                              LeaveStoreInputPositionsF=True)


    # (*) If there was an empty state machine, a 'free pass' is added
    if empty_state_machine_occured_f:
        result = __add_free_pass(result, new_terminal_state_index)

    return result
Example #23
def do(the_state_machines):
    """Connect state machines paralell."""
    assert type(the_state_machines) == list
    assert len(the_state_machines) != 0
    assert map(lambda x: x.__class__.__name__, the_state_machines) == ["StateMachine"] * len(the_state_machines)
              
    # filter out empty state machines from the consideration          
    state_machines = filter(lambda sm: not sm.is_empty(), the_state_machines)

    def __add_optional_free_pass(result_state_machine,
                                 TerminationStateIdx=-1):
        """Add an optional 'free pass' if there was an empty state."""  
        # If there was an empty state machine, the number of elements in the list
        # changed; in that case one has to add a 'free pass' from the initial state to
        # the final acceptance state.
        if TerminationStateIdx == -1:
            acceptance_state_index_list = result_state_machine.get_acceptance_state_index_list()
            assert acceptance_state_index_list != [], \
                   "resulting state machine has no acceptance state!"
            TerminationStateIdx = acceptance_state_index_list[0]

        if len(state_machines) != len(the_state_machines):
            result_state_machine.add_epsilon_transition(result_state_machine.init_state_index, 
                                                        TerminationStateIdx)
        return result_state_machine

    if len(state_machines) < 2:
        if len(state_machines) < 1: return __add_optional_free_pass(StateMachine())
        else:                       return __add_optional_free_pass(state_machines[0])

    # (*) need to clone the state machines, i.e. provide their internal
    #     states with new ids, but the 'behavior' remains. This allows
    #     state machines to appear twice, or being used in 'larger'
    #     conglomerates.
    clone_list = map(lambda sm: sm.clone(), state_machines)

    # (*) collect all transitions from both state machines into a single one
    #     (clone to ensure unique identifiers of states)
    result = StateMachine()
    for clone in clone_list:
        for start_state_index, states in clone.states.items():        
            # DOUBT: is deepcopy necessary at this place?
            # ANSWER: it does not harm, because no new state indices are created
            result.states[start_state_index] = deepcopy(states)

    # (*) add additional **init** and **end** state
    #     NOTE: when the result state machine was created, it already contains a 
    #           new initial state index. thus at this point only the new terminal
    #           state has to be created. 
    #     NOTE: it is essential that the acceptance flag stays False, at this
    #           point in time, so that the mounting operations only happen on
    #           the old acceptance states. Later the acceptance state is raised
    #           to 'accepted' (see below)
    new_terminal_state_index = result.create_new_state() 
    
    # (*) connect from the new initial state to the initial states of the
    #     clones via epsilon transition. 
    #     connect from each success state of the clones to the new end state
    #     via epsilon transition.
    for clone in clone_list:
        result.mount_to_initial_state(clone.init_state_index)
        result.mount_to_acceptance_states(new_terminal_state_index,
                                          CancelStartAcceptanceStateF=True,
                                          RaiseTargetAcceptanceStateF=True,
                                          LeaveStoreInputPositionsF=True)


    return __add_optional_free_pass(result, new_terminal_state_index)
Example #24
def snap_primary(stream, PatternDict):
    """primary:  " non_double_quote *  "              = character string
                 [ non_rect_bracket_close ]           = set of characters
                 { identifier }                       = pattern replacement
                 ( expression )
                 non_control_character+               = lonely characters
                 primary repetition_cmd
    """
    __debug_entry("primary", stream)
    x = stream.read(1)
    lookahead = stream.read(1)
    if x != "" and lookahead != "": stream.seek(-1, 1)
    if x == "": return __debug_exit(None, stream)

    # -- 'primary' primary
    if x == "\"": result = snap_character_string.do(stream)
    elif x == "[":
        stream.seek(-1, 1)
        result = character_set_expression.do(stream, PatternDict)
    elif x == "{":
        result = snap_replacement(stream, PatternDict)
    elif x == ".":
        result = create_ALL_BUT_NEWLINE_state_machine()
    elif x == "(":
        result = snap_bracketed_expression(stream, PatternDict)

    elif x.isspace():
        # a lone-standing space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    elif x in ["*", "+", "?"]:
        raise RegularExpressionException(
            "lonely operator '%s' without expression proceeding." % x)

    elif x == "\\":
        if lookahead == "C":
            stream.read(1)
            result = snap_case_folded_pattern(stream, PatternDict)
        else:
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set == None:
                stream.seek(
                    1, 1)  # snap_property_set() leaves the stream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code == None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
                trigger_set = char_code
            result = StateMachine()
            result.add_transition(result.init_state_index,
                                  trigger_set,
                                  AcceptanceF=True)

    elif x not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        result = snap_non_control_character(stream, PatternDict)

    else:
        # NOTE: This branch also catches the '$' sign, which means 'end of line',
        #       because the '$' sign is in CONTROL_CHARACTERS but is not checked
        #       against above. Thus, it is good to leave here on '$', because the
        #       '$' sign is handled on the very top level.
        # this is not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command?
    result_repeated = __snap_repetition_range(result, stream)
    ## print "##imr:", result.get_string(NormalizeF=False)
    if result_repeated != None: result = result_repeated
    return __debug_exit(construct.beautify(result), stream)
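
The first lines of snap_primary rely on a read-one/peek-one idiom: read the character itself plus one character of lookahead, then push the lookahead back onto the stream. A quick self-contained check of this idiom, assuming a seekable stream as in the code above:

# One-character lookahead idiom used at the top of snap_primary (sketch only).
from StringIO import StringIO

sh = StringIO("ab")
x         = sh.read(1)
lookahead = sh.read(1)
if x != "" and lookahead != "": sh.seek(-1, 1)  # push the lookahead back
assert (x, lookahead) == ("a", "b")
assert sh.read(1) == "b"                        # the lookahead is still available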
Example No. 25
def parse_mode_option(fh, new_mode):
    LanguageDB = Setup.language_db

    def fit_state_machine(SM):
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else:                         result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return result

    identifier = read_option_start(fh)
    if identifier == None: return False

    verify_word_in_list(identifier, lexer_mode.mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the 
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The corresponding CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm  = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index, trigger_set, AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do, 
                               FileName = fh.name, 
                               LineN    = get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        pattern_sm = fit_state_machine(pattern_sm)
        # For skippers, line and column count detection is not really a topic.
        # It is done in the skipper itself.
        pattern_sm.side_info = SideInfo()

        new_mode.add_match(pattern_str, action, pattern_sm)

        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full-fledged regular expression as opener,
        # since it only affects the trigger. Not so the nested range skipper; see below.

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range skippers accept only 'strings' as openers, not state machines
            opener_str, opener_sequence = parse_string_constant(fh, "Opener pattern for 'skip_nested_range'")
            
            opener_sm = StateMachine()
            idx = opener_sm.init_state_index
            for letter in opener_sequence:
                idx = opener_sm.add_transition(idx, letter)
            opener_sm.states[idx].set_acceptance(True)
        else:
            opener_str, opener_sm = regular_expression.parse(fh)
            # For 'range skipping' the opener sequence is not needed; only the opener state
            # machine is woven into the pattern matching state machine.
            opener_sequence       = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = parse_string_constant(fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier, fh)

        # Skipper code is to be generated later
        generator_function = { 
                "skip_range":        skip_range.do,
                "skip_nested_range": skip_nested_range.do,
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName = fh.name, 
                               LineN    = get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"]       = new_mode.name

        fit_state_machine(opener_sm)

        # For skippers, line and column count detection is not really a topic.
        # It is done in the skipper itself.
        opener_sm.side_info = SideInfo()

        new_mode.add_match(opener_str, action, opener_sm)

        return True
        
    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern = ""
        if value.newline_suppressor_state_machine.get() != None:
            suppressed_newline_pattern = \
                  "(" + value.newline_suppressor_state_machine.pattern_str + ")" \
                + "(" + value.newline_state_machine.pattern_str + ")"
                                           
            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])
                 
            FileName = value.newline_suppressor_state_machine.file_name
            LineN    = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code_fragment = UserCodeFragment("goto %s;" % get_label("$start", U=True), FileName, LineN)

            suppressed_newline_sm = fit_state_machine(suppressed_newline_sm)

            # Analyze pattern for constant number of newlines, characters, etc.
            suppressed_newline_sm.side_info = SideInfo(
                    character_counter.get_newline_n(suppressed_newline_sm),
                    character_counter.get_character_n(suppressed_newline_sm))

            new_mode.add_match(suppressed_newline_pattern, code_fragment, suppressed_newline_sm,
                               Comment="indentation newline suppressor")

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #      Let               newline
        #      be defined as:    newline ([space]* newline)*
        #
        # This way empty lines are eaten away before the indentation count is
        # activated (see the re-based check after this function).

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index, value.indentation_count_character_set(), 
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = hopcroft.do(nfa_to_dfa.do(x4), CreateNewStateMachineF=False)

        FileName = value.newline_state_machine.file_name
        LineN    = value.newline_state_machine.line_n
        action   = GeneratedCode(indentation_counter.do, FileName, LineN)

        action.data["indentation_setup"] = value

        sm = fit_state_machine(sm)
        sm.side_info = SideInfo(character_counter.get_newline_n(sm),
                                character_counter.get_character_n(sm))
        new_mode.add_match(value.newline_state_machine.pattern_str,
                           action, sm, Comment="indentation newline")

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert lexer_mode.mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = lexer_mode.mode_option_info_db[identifier]
    if option_info.domain != None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (Value, Option) + \
                  "Though, possible for this option are only: %s." % repr(oi.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True
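
The empty-line trick above, 'newline ([space]* newline)*', can be checked in isolation with Python's re module. This is only an analogy; the real pattern is composed of state machines via sequentialize and repeat:

# Analogy check of the empty-line trick using the standard 're' module.
import re
newline = re.compile(r"\n([ \t]*\n)*")
m = newline.match("\n   \n\t\n    indented_line")
# All empty lines are consumed by a single 'newline' match, so indentation
# counting starts only on a line that actually carries content.
assert m.group() == "\n   \n\t\n"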
Example No. 26
def snap_non_control_characters(stream):
    """Snaps any 'non_control_character' using UTF8 encoding from the given stream. Note
       that in UTF8 a character may consist of more than one byte. Creates a state machine
       that contains solely one trigger for each character to an acceptance state.

       This function **concatenates** incoming characters, but **repetition** has precedence
       over concatenation, so it checks after each character whether it is followed by
       a repetition ('*', '+', '?', '{..}'). In such a case, the repetition of the character
       is appended (see the re-based check after this function).
    """
    __debug_entry("non-control characters", stream)

    result = StateMachine()
    state_index = result.init_state_index
    # (*) read first character
    position = stream.tell()
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    while char_code != 0xFF:
        # (1) check against occurrence of control characters
        #     this needs to come **before** the backslashed character interpretation.
        #     NOTE: A backslashed character can be a whitespace (for example '\n').
        #     (check against 0xFF to avoid overflow in function 'chr()')
        if char_code < 0xFF \
           and (chr(char_code) in CONTROL_CHARACTERS or chr(char_code).isspace()):
            stream.seek(-1, 1)
            break

        # (2) treat backslashed characters
        if char_code == ord('\\'):
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set == None:
                stream.seek(1, 1)  # snap_property_set() leaves the stream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code == None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
                trigger_set = char_code
        else:
            trigger_set = char_code

        # (3) read next character
        position = stream.tell()
        next_char_code = utf8.__read_one_utf8_code_from_stream(stream)
        #    -- check for repetition (repetition has precedence over concatenation)
        if next_char_code in [ord("+"), ord("*"), ord("?"), ord("{")]:
            # (*) create state machine that consist of a single transition
            tmp = StateMachine()
            tmp.add_transition(tmp.init_state_index,
                               trigger_set,
                               AcceptanceF=True)
            # -- repeat the single character state machine
            stream.seek(position)
            tmp_repeated = __snap_repetition_range(tmp, stream)
            # -- append it to the result (last state must be set to acceptance for concatenation)
            result.states[state_index].set_acceptance()
            result = sequentialize.do([result, tmp_repeated],
                                      MountToFirstStateMachineF=True)
            # As soon as there is repetition there might be more than one acceptance
            # state, and thus simple concatenation via 'add_transition' fails.
            # Let us return and treat the remaining characters
            # at the next call to this function.
            return __debug_exit(result, stream)

        else:
            # (*) add new transition from current state to a new state triggering
            #     on the given character.
            state_index = result.add_transition(state_index, trigger_set)

        char_code = next_char_code

    # last character in the chain triggers an 'acceptance state'
    result.states[state_index].set_acceptance()

    return __debug_exit(result, stream)
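
The precedence rule from the docstring above, that repetition binds tighter than concatenation, is the standard one for regular expressions. A quick check with the standard re module (analogy only):

# 'ab+' means a(b+), not (ab)+: the repetition applies only to the last character.
import re
assert re.match(r"ab+", "abbb").group() == "abbb"
assert re.match(r"ab+", "abab").group() == "ab"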
Example No. 27
def do(SM):
    """Creates a deterministic finite automaton (DFA) from the current state 
       machine - which may be an NFA (non-deterministic finite automaton). This is
       a generalized version of the 'subset construction' algorithm. Where 
       subset construction focuses on letters of an alphabet for the
       investigation of transitions, this algorithm focuses on elementary
       trigger sets. A very good description of the subset construction 
       algorithm can be found in 'Engineering a Compiler' by Keith Cooper
       (see also the self-contained sketch after this function).
    """
    # (*) create the result state machine
    initial_state_epsilon_closure = SM.get_epsilon_closure(SM.init_state_index) 

    # NOTE: Later on, state machines with an initial acceptance state are forbidden.
    #       So, acceptance is not a question here. Think about setting it to false anyway.
    result = StateMachine(Core = SM.core())

    # (*) initial state of resulting DFA = epsilon closure of initial state of NFA
    #     -- add the origin list of all states in the epsilon closure
    new_init_state = result.get_init_state()
    for state in map(lambda idx: SM.states[idx], initial_state_epsilon_closure):
        new_init_state.merge(state)

    # (*) prepare the initial worklist
    worklist = [ ( result.init_state_index, initial_state_epsilon_closure) ]

    epsilon_closure_db = SM.get_epsilon_closure_db()

    while worklist != []:
        # 'start_state_index' is the index of an **existing** state in the state machine.
        # It was either created above, in StateMachine's constructor, or as a target
        # state index.
        start_state_index, start_state_combination = worklist.pop()
 
        # (*) compute the elementary trigger sets together with the 
        #     epsilon closure of target state combinations that they trigger to.
        #     In other words: find the ranges of characters where the state triggers to
        #     a unique state combination. E.g:
        #                Range        Target State Combination 
        #                [0:23]   --> [ State1, State2, State10 ]
        #                [24:60]  --> [ State1 ]
        #                [61:123] --> [ State2, State10 ]
        #
        elementary_trigger_set_infos = SM.get_elementary_trigger_sets(start_state_combination,
                                                                      epsilon_closure_db)
        ## DEBUG_print(start_state_combination, elementary_trigger_set_infos)

        # (*) loop over all elementary trigger sets
        for epsilon_closure_of_target_state_combination, trigger_set in elementary_trigger_set_infos:
            #  -- if there is no trigger to the given target state combination, then drop it
            if trigger_set.is_empty(): continue

            # -- add a new target state representing the state combination
            #    (if this did not happen yet)
            target_state_index = \
                 map_state_combination_to_index(epsilon_closure_of_target_state_combination)

            # -- if the target state combination was considered before, then the
            #    target state already exists; otherwise it is created implicitly
            #    by the transition below and its origins must be merged
            if result.states.has_key(target_state_index):
                # -- add only a transition 'start state to target state'
                result.add_transition(start_state_index, trigger_set, target_state_index)
            else:
                # -- add the transition 'start state to target state'
                #    (create implicitly the new target state in the state machine)
                result.add_transition(start_state_index, trigger_set, target_state_index)
                # -- merge the information of the combined states into the target state
                new_target_state = result.states[target_state_index]
                for state in map(lambda idx: SM.states[idx], epsilon_closure_of_target_state_combination):
                    new_target_state.merge(state)

                worklist.append((target_state_index, epsilon_closure_of_target_state_combination))  

    return result 
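
For reference, the plain letter-based subset construction that the function above generalizes can be written compactly. The following is a self-contained sketch on plain dicts; 'subset_construction' is hypothetical and not the quex StateMachine API:

# Self-contained subset-construction sketch for a letter-based NFA.
# nfa_edges: dict (state, letter) -> set(target states); letter None = epsilon.
def subset_construction(nfa_edges, init, accept):
    def eps_closure(states):
        stack, closure = list(states), set(states)
        while stack:
            for t in nfa_edges.get((stack.pop(), None), ()):
                if t not in closure:
                    closure.add(t)
                    stack.append(t)
        return frozenset(closure)

    start = eps_closure([init])
    dfa_edges, dfa_accept = {}, set()
    worklist, seen = [start], set([start])
    while worklist:
        combo = worklist.pop()
        if combo & accept: dfa_accept.add(combo)
        for letter in set(l for (s, l) in nfa_edges if s in combo and l != None):
            target = eps_closure(set().union(*[nfa_edges.get((s, letter), set())
                                               for s in combo]))
            dfa_edges[(combo, letter)] = target
            if target not in seen:
                seen.add(target)
                worklist.append(target)
    return start, dfa_edges, dfa_accept

# 'a?b' as NFA: 0 --'a'--> 1, 0 --epsilon--> 1, 1 --'b'--> 2, acceptance on state 2
start, edges, accept = subset_construction(
    {(0, 'a'): set([1]), (0, None): set([1]), (1, 'b'): set([2])}, 0, set([2]))
assert start == frozenset([0, 1]) and frozenset([2]) in accept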
Example No. 28
def parse_mode_option(fh, new_mode):
    skip_whitespace(fh)

    # (*) base modes 
    if fh.read(1) != "<": return False

    skip_whitespace(fh)

    identifier = read_identifier(fh).strip()

    if identifier == "":  error_msg("missing identifer after start of mode option '<'", fh)
    skip_whitespace(fh)
    if fh.read(1) != ":": error_msg("missing ':' after option name '%s'" % identifier, fh)
    skip_whitespace(fh)

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the 
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        opener_sm = StateMachine()
        opener_sm.add_transition(opener_sm.init_state_index, trigger_set, AcceptanceF=True)
            
        action = CodeFragment(create_skip_code(trigger_set))
 
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        new_mode.add_match(pattern_str, action, opener_sm)

        return True

    elif identifier == "skip_range":
        # A non-nesting skipper can contain a full-fledged regular expression as opener,
        # since it only affects the trigger. Not so the nested range skipper; see below.

        # -- opener
        skip_whitespace(fh)
        opener_str, opener_sm = regular_expression.parse(fh)
        skip_whitespace(fh)

        # -- closer
        if fh.read(1) != "\"":
            error_msg("closing pattern for skip_range can only be a string and must start with a quote like \".", fh)
        closer_sequence = snap_character_string.get_character_code_sequence(fh)
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier, fh)

        action = CodeFragment(create_skip_range_code(closer_sequence))

        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        new_mode.add_match(opener_str, action, opener_sm)
        return True
        
    elif identifier == "skip_nesting_range":
        error_msg("skip_nesting_range is not yet supported.", fh)

    else:
        value, i = read_until_letter(fh, [">"], Verbose=1)
        if i != 0:
            error_msg("missing closing '>' for mode option '%s'" % identifier, fh)

        value = value.strip()

    # Does the specified option actually exist?
    if not lexer_mode.mode_option_info_db.has_key(identifier):
        error_msg("tried to set option '%s' which does not exist!\n" % identifier + \
                  "options are %s" % repr(lexer_mode.mode_option_info_db.keys()), fh)

    # Is the option of the appropriate value?
    option_info = lexer_mode.mode_option_info_db[identifier]
    if option_info.type != "list" and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (Value, Option) + \
                  "Though, possible \n" + \
                  "for this option are %s" % repr(oi.domain), fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True