def __get_inverse_state_machine_that_finds_end_of_core_expression(PostConditionSM):
    """In case of a pseudo-ambiguous post condition one needs to go backwards
       in order to search for the end of the core condition. This function 
       creates the inverse state machine that is able to go backwards.

       NOTE: This is a special case, because one already knows that the state
       machine reaches the acceptance state at some point (this is where it
       actually started). That means that in states other than acceptance
       states one can take out the 'drop out' triggers, since they CANNOT
       occur. This enables some speed-up when going backwards.
    """
    result = PostConditionSM.get_inverse()
    result = nfa_to_dfa.do(result)
    result = hopcroft.do(result)

    # -- delete 'drop-out' transitions in non-acceptance states
    #    NOTE: When going backwards one already knows that the acceptance
    #          state (the init state of the post condition) is reached, see above.
    for state in result.states.values():
        # -- acceptance states may have a 'drop-out' (in fact, they must have one)
        if state.is_acceptance(): continue

        state.transitions().replace_drop_out_target_states_with_adjacent_targets()

    result = nfa_to_dfa.do(result)
    result = hopcroft.do(result)

    # Acceptance States need to be marked: Store input position.
    # NOTE: When tracing backwards the match is guaranteed, but there might
    #       still be some 'trail' in case of iterations that are not directly
    #       iterated to the ambiguous post condition. Thus drop out may
    #       happen and it must be clear where to put the input pointer in this case.

    return result
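Throughout these examples the same pipeline recurs: invert, then 'nfa_to_dfa.do()', then 'hopcroft.do()'. As a minimal plain-Python sketch (not the quex API; 'reverse_transitions' is a name invented here), reversing the edges of even a tiny deterministic machine can yield an NFA, which is why determinization and minimization always follow the inversion:

def reverse_transitions(transitions):
    """transitions: dict mapping (state, symbol) --> target state."""
    reversed_map = {}
    for (state, symbol), target in transitions.items():
        # The edge state --symbol--> target becomes target --symbol--> state.
        reversed_map.setdefault((target, symbol), set()).add(state)
    return reversed_map

# 0 --a--> 1 and 2 --a--> 1: after reversal, state 1 has two successors on
# 'a', i.e. the result is an NFA even though the input was deterministic.
print(reverse_transitions({(0, "a"): 1, (2, "a"): 1}))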
def philosophical_cut(core_sm, post_context_sm):
    """The 'philosophical cut' is a technique introduced by Frank-Rene Schaefer
       to produce a pair of a core- and a post-condition that otherwise would 
       be forward and backward ambiguous. The philosophical ground for this
       cut is 'greed', i.e. a core pattern should eat as many characters as
       it can. This idea is followed throughout the construction of the lexical
       analyzer. 
       
       For the case of total ambiguity 'x+/x+', this idea translates into leaving
       the iteration in the core condition and cutting the iteration in the post
       condition. Thus 'x+/x+' is transformed into 'x+/x' and can be solved by
       the technique for forward ambiguous post conditions.
    """
    core_acceptance_state_list = core_sm.get_acceptance_state_list()

    pcsm_init_state = post_context_sm.get_init_state()
    for csm_state in core_acceptance_state_list:
        __dive_to_cut_iteration(core_sm, csm_state, post_context_sm, pcsm_init_state,
                                SM1_Path=[post_context_sm.init_state_index])

    # By means of cutting, some states may have been left with nothing but an
    # epsilon transition. Thus, a transformation NFA->DFA followed by a
    # hopcroft optimization is required.
    new_post_sm = nfa_to_dfa.do(post_context_sm)
    new_post_sm = hopcroft.do(new_post_sm)
    return new_post_sm
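A plain-Python sketch of what the cut achieves (illustrative only; 'valid_splits' is not part of quex): enumerating the possible boundaries between core and post condition shows that 'x+/x+' admits several splits, while the transformed 'x+/x' leaves exactly one, with the core eating greedily:

import re

def valid_splits(text, core, post):
    return [(text[:i], text[i:]) for i in range(1, len(text))
            if re.fullmatch(core, text[:i]) and re.fullmatch(post, text[i:])]

print(valid_splits("xxxx", r"x+", r"x+"))   # three possible boundaries -> ambiguous
print(valid_splits("xxxx", r"x+", r"x"))    # exactly one: ('xxx', 'x') -> greedy core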
Example No. 3
def philosophical_cut(core_sm, post_context_sm):
    """The 'philosophical cut' is a technique introduced by Frank-Rene Schaefer
       to produce a pair of a core- and a post-condition that otherwise would 
       be forward and backward ambiguous. The philosophical ground for this
       cut is 'greed', i.e. a core pattern should eat as many characters as
       it can. This idea is followed throughout the construction of the lexical
       analyzer. 
       
       For the case of total ambiguity 'x+/x+', this idea translates into leaving
       the iteration in the core condition and cutting the iteration in the post
       condition. Thus 'x+/x+' is transformed into 'x+/x' and can be solved by
       the technique for forward ambiguous post conditions.
    """
    core_acceptance_state_list = core_sm.get_acceptance_state_list()

    pcsm_init_state = post_context_sm.get_init_state()
    for csm_state in core_acceptance_state_list:
        __dive_to_cut_iteration(core_sm,
                                csm_state,
                                post_context_sm,
                                pcsm_init_state,
                                SM1_Path=[post_context_sm.init_state_index])

    # By means of cutting, some states may have been left with nothing but an
    # epsilon transition. Thus, a transformation NFA->DFA followed by a
    # hopcroft optimization is required.
    new_post_sm = nfa_to_dfa.do(post_context_sm)
    new_post_sm = hopcroft.do(new_post_sm)
    return new_post_sm
Example No. 4
def do(sm):
    state_list = sm.states.items()
    for state_index, state in state_list:
        # Get the 'transition_list', i.e. a list of pairs (TargetState, NumberSet)
        # which indicates what target state is reached via what number set.
        transition_list = state.transitions().get_map().items()
        # Clear the state's transitions, now. This way it can absorb new
        # transitions to intermediate states.
        state.transitions().clear()
        # Loop over all transitions
        for target_state_index, number_set in transition_list:
            # We take the intervals with 'PromiseToTreatWellF' even though they
            # are changed. This is because the intervals would be lost anyway
            # after the state split, so we use the same memory and avoid
            # time-consuming memory copies and constructor calls.
            interval_list = number_set.get_intervals(PromiseToTreatWellF=True)

            # First, check whether a modification is necessary
            modification_required_f = False
            for interval in interval_list:
                if interval.begin >= 0x10000: modification_required_f = True; break

            if not modification_required_f:
                sm.states[state_index].add_transition(number_set, target_state_index)
                continue

            # Now, intermediate states may be added
            for interval in interval_list:
                create_intermediate_states(sm, state_index, target_state_index, interval)
    
    result = hopcroft_minimization.do(nfa_to_dfa.do(sm), CreateNewStateMachineF=False)
    return result
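Unlike the later UTF8 variants, this example only splits intervals at or above 0x10000, which suggests a target encoding with surrogate pairs such as UTF16; that reading is an assumption, since the example carries no docstring. A self-contained sketch of the split that the intermediate states would then implement:

def utf16_code_units(code_point):
    # Code points below 0x10000 fit into a single code unit; others are
    # split into a high and a low surrogate, i.e. one intermediate state.
    if code_point < 0x10000:
        return [code_point]
    v = code_point - 0x10000
    return [0xD800 + (v >> 10), 0xDC00 + (v & 0x3FF)]

print([hex(u) for u in utf16_code_units(0x20AC)])    # ['0x20ac']
print([hex(u) for u in utf16_code_units(0x1F600)])   # ['0xd83d', '0xde00']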
Example No. 5
def __get_DFA_compliant_state_machine(SM):
    result = SM
    if not result.is_DFA_compliant():
        result = nfa_to_dfa.do(result)
        result = hopcroft.do(result, CreateNewStateMachineF=False)

    pre_sm = result.core().pre_context_sm()
    if pre_sm != None:
        # If pre-context state machine is not DFA compliant,
        # then make it compliant.
        if not pre_sm.is_DFA_compliant():
            pre_sm = nfa_to_dfa.do(pre_sm)
            pre_sm = hopcroft.do(pre_sm, CreateNewStateMachineF=False)
            result.replace_pre_context_state_machine(pre_sm)

    return result
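A rough plain-Python sketch of the property being tested here ('is_dfa_compliant' below is an illustrative stand-in; real DFA compliance also rules out epsilon transitions): a machine fails the test as soon as one state triggers to two targets on the same symbol:

def is_dfa_compliant(transition_triples):
    """transition_triples: iterable of (state, symbol, target)."""
    seen = set()
    for state, symbol, _ in transition_triples:
        if (state, symbol) in seen:
            return False          # two targets on the same symbol -> NFA
        seen.add((state, symbol))
    return True

print(is_dfa_compliant([(0, "a", 1), (0, "b", 2)]))   # True
print(is_dfa_compliant([(0, "a", 1), (0, "a", 2)]))   # False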
Example No. 6
def __get_DFA_compliant_state_machine(SM):
    result = SM
    if not result.is_DFA_compliant(): 
        result = nfa_to_dfa.do(result)
        result = hopcroft.do(result, CreateNewStateMachineF=False)

    pre_sm = result.core().pre_context_sm()
    if pre_sm != None:
        # If pre-context state machine is not DFA compliant, 
        # then make it compliant.
        if not pre_sm.is_DFA_compliant(): 
            pre_sm = nfa_to_dfa.do(pre_sm)
            pre_sm = hopcroft.do(pre_sm, CreateNewStateMachineF=False)
            result.replace_pre_context_state_machine(pre_sm)

    return result
def detect_backward(CoreStateMachine, PostConditionStateMachine):
    """A 'backward ambiguity' denotes the case where it cannot clearly be
       determined how far to go back from the end of a post-condition. 
       
       NOTE: This does not mean that the post-condition is ambiguous. Many
       cases that are backward ambiguous can be handled by quex's normal
       post-condition handling.

       Examples:  x/x+   is backward ambiguous because in a stream
                         of 'x' one cannot determine with a pure
                         state machine where to stop. This case,
                         though can be handled by the normal post-
                         condition implementation.

                  x+/x+  is backward ambiguous and cannot be handled
                         by the normal implementation. In fact, this
                         specification does not allow any conclusions
                         about the user's intent as to where to reset the
                         input after a match.
    """

    __assert_state_machines(CoreStateMachine, PostConditionStateMachine)

    my_post_context_sm = PostConditionStateMachine.clone()

    # (*) Create a modified version of the post condition, where the
    #     initial state is an acceptance state, and no other. This 
    #     allows the detector to trigger on 'iteration'.
    #
    # -- delete all acceptance states in the post condition
    # for state in my_post_context_sm.states.values():
    #   state.set_acceptance(False)
    # -- set the initial state as acceptance state
    # my_post_context_sm.get_init_state().set_acceptance(True)

    my_core_sm = CoreStateMachine.get_inverse()
    my_core_sm = nfa_to_dfa.do(my_core_sm)
    my_core_sm = hopcroft.do(my_core_sm)

    tmp = deepcopy(PostConditionStateMachine)
    my_post_context_sm = tmp.get_inverse()
    my_post_context_sm = nfa_to_dfa.do(my_post_context_sm)
    my_post_context_sm = hopcroft.do(my_post_context_sm)

    return detect_forward(my_post_context_sm, my_core_sm)
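The final line reduces the backward question to 'detect_forward' on the inverted machines with the arguments swapped. A plain-Python sketch of that reduction (the expressions are reversed by hand here, since they are literal; quex's get_inverse() does the same on the state machine): walking core-then-post forwards over the input equals walking reversed-post-then-reversed-core forwards over the reversed input:

import re

text, core, post = "abcde", r"abc", r"de"
# Forward view: core followed by post condition.
print(bool(re.fullmatch(core + post, text)))              # True
# Backward view: reversed post followed by reversed core, over reversed input.
print(bool(re.fullmatch("ed" + "cba", text[::-1])))       # True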
Example No. 8
def detect_backward(CoreStateMachine, PostConditionStateMachine):
    """A 'backward ambiguity' denotes the case where it cannot be clearly be
       determined how far to go back from the end of a post-condition. 
       
       NOTE: This does not mean that the post-condition is ambiguous. Many
       cases that are backward ambiguous can be handled by quex's normal
       post-condition handling.

       Examples:  x/x+   is backward ambiguous because in a stream
                         of 'x' one cannot determine with a pure
                         state machine where to stop. This case,
                         though can be handled by the normal post-
                         condition implementation.

                  x+/x+  is backward ambiguous and cannot be handled
                         by the normal implementation. In fact, this
                         specification does not allow any conclusions
                         about the user's intent as to where to reset the
                         input after a match.
    """

    __assert_state_machines(CoreStateMachine, PostConditionStateMachine)

    my_post_context_sm = PostConditionStateMachine.clone()

    # (*) Create a modified version of the post condition, where the
    #     initial state is an acceptance state, and no other. This
    #     allows the detector to trigger on 'iteration'.
    #
    # -- delete all acceptance states in the post condition
    # for state in my_post_context_sm.states.values():
    #   state.set_acceptance(False)
    # -- set the initial state as acceptance state
    # my_post_context_sm.get_init_state().set_acceptance(True)

    my_core_sm = CoreStateMachine.get_inverse()
    my_core_sm = nfa_to_dfa.do(my_core_sm)
    my_core_sm = hopcroft.do(my_core_sm)

    tmp = deepcopy(PostConditionStateMachine)
    my_post_context_sm = tmp.get_inverse()
    my_post_context_sm = nfa_to_dfa.do(my_post_context_sm)
    my_post_context_sm = hopcroft.do(my_post_context_sm)

    return detect_forward(my_post_context_sm, my_core_sm)
Example No. 9
def __beautify(the_state_machine):
    ## assert the_state_machine.get_orphaned_state_index_list() == [], \
    ##       "before conversion to DFA: orphaned states " + repr(the_state_machine)
    result = nfa_to_dfa.do(the_state_machine)
    ## assert the_state_machine.get_orphaned_state_index_list() == [], \
    ##       "after conversion to DFA: orphaned states " + repr(the_state_machine)
    result = hopcroft.do(result)  #, CreateNewStateMachineF=False)
    ## assert the_state_machine.get_orphaned_state_index_list() == [], \
    ##       "after hopcroft minimization: orphaned states " + repr(the_state_machine)

    return result
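'__beautify' is nothing more than determinization followed by minimization. For readers without the quex sources, a compact sketch of what the 'nfa_to_dfa' step does (a toy representation, state -> symbol -> set of targets, with epsilon closure omitted; this is not quex's data model):

def subset_construction(nfa, start):
    """nfa: dict state -> {symbol: set of target states}."""
    dfa, work = {}, [frozenset([start])]
    while work:
        state_set = work.pop()
        if state_set in dfa: continue
        dfa[state_set] = {}
        for sym in set(s for q in state_set for s in nfa.get(q, {})):
            # The DFA successor on 'sym' is the union of all NFA targets.
            target = frozenset(t for q in state_set
                                 for t in nfa.get(q, {}).get(sym, ()))
            dfa[state_set][sym] = target
            work.append(target)
    return dfa

nfa = {0: {"x": set([0, 1])}, 1: {}}    # a toy NFA for 'x+', acceptance in 1
for state_set, edges in subset_construction(nfa, 0).items():
    print(sorted(state_set), dict((s, sorted(t)) for s, t in edges.items()))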
Example No. 10
def beautify(the_state_machine):
    ## assert the_state_machine.get_orphaned_state_index_list() == [], \
    ##       "before conversion to DFA: orphaned states " + repr(the_state_machine)
    result = nfa_to_dfa.do(the_state_machine)
    ## assert the_state_machine.get_orphaned_state_index_list() == [], \
    ##       "after conversion to DFA: orphaned states " + repr(the_state_machine)
    result = hopcroft.do(result, CreateNewStateMachineF=False)
    ## assert the_state_machine.get_orphaned_state_index_list() == [], \
    ##       "after hopcroft minimization: orphaned states " + repr(the_state_machine)

    return result
Example No. 11
def __get_inverse_state_machine_that_finds_end_of_core_expression(
        PostConditionSM):
    """In case of a pseudo-ambiguous post condition one needs to go backwards
       in order to search for the end of the core condition. This function 
       creates the inverse state machine that is able to go backwards.

       NOTE: This is a special case, because one already knows that the state
       machine reaches the acceptance state at some point (this is where it
       actually started). That means that in states other than acceptance
       states one can take out the 'drop out' triggers, since they CANNOT
       occur. This enables some speed-up when going backwards.
    """
    result = PostConditionSM.get_inverse()
    result = nfa_to_dfa.do(result)
    result = hopcroft.do(result)

    # -- delete 'drop-out' transitions in non-acceptance states
    #    NOTE: When going backwards one already knows that the acceptance
    #          state (the init state of the post condition) is reached, see above.
    for state in result.states.values():
        # -- acceptance states may have a 'drop-out' (in fact, they must have one)
        if state.is_acceptance(): continue

        state.transitions(
        ).replace_drop_out_target_states_with_adjacent_targets()

    result = nfa_to_dfa.do(result)
    result = hopcroft.do(result)

    # Acceptance States need to be marked: Store input position.
    # NOTE: When tracing backwards the match is guaranteed, but there might
    #       still be some 'trail' in case of iterations that are not directly
    #       iterated to the ambiguous post condition. Thus drop out may
    #       happen and it must be clear where to put the input pointer in this case.

    return result
Example No. 12
def do(the_state_machine, pre_context_state_machine):
    """Sets up a pre-condition to the given state machine. This process
       is entirely different from any sequentialization or parallelization
       of state machines. Here, the state machine representing the pre-
       condition is **not** webbed into the original state machine!

       Instead, the following happens:

          -- the pre-condition state machine is inverted, because
             it is to be walked through backwards.
          -- the inverted state machine is marked with the state machine id
             of the_state_machine.        
          -- the original state machine will refer to the inverse
             state machine of the pre-condition.
          -- the initial state origins and the origins of the acceptance
             states are marked as 'pre-conditioned' indicating the id
             of the inverted state machine of the pre-condition.             
    """
    #___________________________________________________________________________________________
    # (*) do some consistency checking   
    assert the_state_machine.__class__.__name__ == "StateMachine"
    assert pre_context_state_machine.__class__.__name__ == "StateMachine"
    # -- state machines with no states are senseless here. 
    assert not the_state_machine.is_empty() 
    assert not pre_context_state_machine.is_empty()
    # -- trivial pre-conditions should be added last, for simplicity
    assert not the_state_machine.core().pre_context_begin_of_line_f(), \
           "This function was not designed to deal with trivially pre-conditioned state machines." + \
           "Please, make sure the trivial pre-conditioning happens *after* regular pre-conditions."  
    #___________________________________________________________________________________________
        
    # (*) invert the state machine of the pre-condition 
    inverse_pre_context = pre_context_state_machine.get_inverse()
    inverse_pre_context = nfa_to_dfa.do(inverse_pre_context)
    inverse_pre_context = hopcroft.do(inverse_pre_context)
        
    # (*) let the state machine refer to it 
    #     [Is this necessary? Is it not enough that the acceptance origins point to it? <fschaef>]
    the_state_machine.core().set_pre_context_sm(inverse_pre_context)
    pre_context_sm_id = inverse_pre_context.get_id()

    # (*) create origin data where there is none yet; do not delete existing
    #     origins, otherwise information gets lost.
    for state in the_state_machine.states.values():
        if not state.is_acceptance(): continue
        state.core().set_pre_context_id(pre_context_sm_id)
    
    return the_state_machine
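A loose analogy using Python's 're' module (an analogy only; quex generates the backward run over the inverted machine itself): a pre-condition acts like a lookbehind that is verified backwards from the point where the core pattern begins:

import re

# 'world' pre-conditioned by 'hello ': accepted only where the backward run
# over the inverted pre-condition machine would succeed at the match start.
print(re.search(r"(?<=hello )world", "hello world"))   # a match at position 6
print(re.search(r"(?<=hello )world", "cruel world"))   # None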
Example No. 13
def do(the_state_machine, pre_context_state_machine):
    """Sets up a pre-condition to the given state machine. This process
       is entirely different from any sequentialization or parallelization
       of state machines. Here, the state machine representing the pre-
       condition is **not** webbed into the original state machine!

       Instead, the following happens:

          -- the pre-condition state machine is inverted, because
             it is to be walked through backwards.
          -- the inverted state machine is marked with the state machine id
             of the_state_machine.        
          -- the original state machine will refer to the inverse
             state machine of the pre-condition.
          -- the initial state origins and the origins of the acceptance
             states are marked as 'pre-conditioned' indicating the id
             of the inverted state machine of the pre-condition.             
    """
    #___________________________________________________________________________________________
    # (*) do some consistency checking
    assert the_state_machine.__class__.__name__ == "StateMachine"
    assert pre_context_state_machine.__class__.__name__ == "StateMachine"
    # -- state machines with no states are senseless here.
    assert not the_state_machine.is_empty()
    assert not pre_context_state_machine.is_empty()
    # -- trivial pre-conditions should be added last, for simplicity
    assert not the_state_machine.core().pre_context_begin_of_line_f(), \
           "This function was not designed to deal with trivially pre-conditioned state machines." + \
           "Please, make sure the trivial pre-conditioning happens *after* regular pre-conditions."
    #___________________________________________________________________________________________

    # (*) invert the state machine of the pre-condition
    inverse_pre_context = pre_context_state_machine.get_inverse()
    inverse_pre_context = nfa_to_dfa.do(inverse_pre_context)
    inverse_pre_context = hopcroft.do(inverse_pre_context)

    # (*) let the state machine refer to it
    #     [Is this necessary? Is it not enough that the acceptance origins point to it? <fschaef>]
    the_state_machine.core().set_pre_context_sm(inverse_pre_context)
    pre_context_sm_id = inverse_pre_context.get_id()

    # (*) create origin data where there is none yet; do not delete existing
    #     origins, otherwise information gets lost.
    for state in the_state_machine.states.values():
        if not state.is_acceptance(): continue
        state.core().set_pre_context_id(pre_context_sm_id)

    return the_state_machine
Example No. 14
def do(sm):
    """The UTF8 encoding causes a single unicode character code being translated
       into a sequence of bytes. A state machine triggering on unicode characters
       can be converted into a state machine triggering on UTF8 bytes.

       For this a simple transition on a character 'X':

            [ 1 ]---( X )--->[ 2 ]

       needs to be translated into a sequence of state transitions

            [ 1 ]---(x0)--->[ S0 ]---(x1)--->[ S1 ]---(x2)--->[ 2 ]

       where x0, x1, x2 are the UTF8 bytes that represent unicode 'X'.
       States S0 and S1 are intermediate states created only so that
       x1 and x2 can trigger. Note that the UTF8 sequence ends
       at the same state '2' as the previous single trigger 'X'.
    """
    state_list = sm.states.items()
    for state_index, state in state_list:
        # Get the 'transition_list', i.e. a list of pairs (TargetState, NumberSet)
        # which indicates what target state is reached via what number set.
        transition_list = state.transitions().get_map().items()
        # Clear the state's transitions, now. This way it can absorb new
        # transitions to intermediate states.
        state.transitions().clear()
        # Loop over all transitions
        for target_state_index, number_set in transition_list:
            # We take the intervals with 'PromiseToTreatWellF' even though they
            # are changed. This is because the intervals would be lost anyway
            # after the state split, so we use the same memory and avoid
            # time-consuming memory copies and constructor calls.
            for interval in number_set.get_intervals(PromiseToTreatWellF=True):
                create_intermediate_states(sm, state_index, target_state_index,
                                           interval)

    return hopcroft_minimization.do(nfa_to_dfa.do(sm),
                                    CreateNewStateMachineF=False)
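The byte sequences x0, x1, x2 from the docstring can be observed with Python's own UTF8 encoder; a character that encodes to n bytes needs n - 1 intermediate states:

for ch in u"A", u"\u00e9", u"\u20ac", u"\U0001F600":
    print(hex(ord(ch)), [hex(b) for b in bytearray(ch.encode("utf-8"))])
# 0x41     ['0x41']                            -- no intermediate state
# 0xe9     ['0xc3', '0xa9']                    -- one intermediate state
# 0x20ac   ['0xe2', '0x82', '0xac']            -- two intermediate states
# 0x1f600  ['0xf0', '0x9f', '0x98', '0x80']    -- three intermediate states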
Example No. 15
def do(sm):
    """The UTF8 encoding causes a single unicode character code being translated
       into a sequence of bytes. A state machine triggering on unicode characters
       can be converted into a state machine triggering on UTF8 bytes.

       For this a simple transition on a character 'X':

            [ 1 ]---( X )--->[ 2 ]

       needs to be translated into a sequence of state transitions

            [ 1 ]---(x0)--->[ S0 ]---(x1)--->[ S1 ]---(x2)--->[ 2 ]

       where x0, x1, x2 are the UTF8 bytes that represent unicode 'X'.
       States S0 and S1 are intermediate states created only so that
       x1 and x2 can trigger. Note that the UTF8 sequence ends
       at the same state '2' as the previous single trigger 'X'.
    """
    state_list = sm.states.items()
    for state_index, state in state_list:
        # Get the 'transition_list', i.e. a list of pairs (TargetState, NumberSet)
        # which indicates what target state is reached via what number set.
        transition_list = state.transitions().get_map().items()
        # Clear the state's transitions, now. This way it can absorb new
        # transitions to intermediate states.
        state.transitions().clear()
        # Loop over all transitions
        for target_state_index, number_set in transition_list:
            # We take the intervals with 'PromiseToTreatWellF' even though they
            # are changed. This is because the intervals would be lost anyway
            # after the state split, so we use the same memory and avoid
            # time-consuming memory copies and constructor calls.
            for interval in number_set.get_intervals(PromiseToTreatWellF=True):
                create_intermediate_states(sm, state_index, target_state_index, interval)

    return hopcroft_minimization.do(nfa_to_dfa.do(sm), CreateNewStateMachineF=False)
Example No. 16
def fit_state_machine(SM):
    if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
    else:                         result = SM
    result = hopcroft.do(result, CreateNewStateMachineF=False)
    return result
Example No. 17
def get_combined_state_machine(StateMachine_List,
                               FilterDominatedOriginsF=True):
    """Creates a DFA state machine that incorporates the paralell
       process of all pattern passed as state machines in 
       the StateMachine_List. Each origins of each state machine
       are kept in the final state, if it is not dominated.

       Performs: -- parallelization
                 -- translation from NFA to DFA
                 -- Frank Schaefer's adapted Hopcroft optimization.

       Again: The state machine ids of the original state machines
              are traced through the whole process.
              
       FilterDominatedOriginsF, if set to False, disables the filtering
              of dominated origins. This is important for pre-conditions, because
              all successful patterns need to be reported!
                      
    """
    def __check(Place, sm):
        __check_on_orphan_states(Place, sm)
        __check_on_init_state_not_acceptance(Place, sm)

    def __check_on_orphan_states(Place, sm):
        orphan_state_list = sm.get_orphaned_state_index_list()
        if orphan_state_list == []: return
        error_msg("After '%s'" % Place + "\n" + \
                  "Orphaned state(s) detected in regular expression (optimization lack).\n" + \
                  "Please, log a defect at the projects website quex.sourceforge.net.\n"    + \
                  "Orphan state(s) = " + repr(orphan_state_list)                       + "\n")

    def __check_on_init_state_not_acceptance(Place, sm):
        init_state = sm.get_init_state()
        if init_state.core().is_acceptance():
            error_msg("After '%s'" % Place + "\n" + \
                      "The initial state is 'acceptance'. This should never appear.\n" + \
                      "Please, log a defect at the projects website quex.sourceforge.net.\n")

        if filter(lambda origin: origin.is_acceptance(),
                  init_state.origins().get_list()) != []:
            error_msg("After '%s'" % Place + "\n" + \
                      "Initial state contains an origin that is 'acceptance'. This should never appear.\n" + \
                      "Please, log a defect at the projects website quex.sourceforge.net.\n")

    # (1) mark in each state machine the machine and its states as 'original'.
    #
    #     This is necessary to trace, in the combined state machine, the
    #     pattern that actually matched. Note that a state machine in
    #     the StateMachine_List represents one possible pattern that can
    #     match the current input.
    #
    map(lambda x: x.mark_state_origins(), StateMachine_List)

    for sm in StateMachine_List:
        assert sm.is_DFA_compliant(), repr(sm)

    # (2) setup all patterns in parallel
    sm = parallelize.do(StateMachine_List,
                        CommonTerminalStateF=False)  #, CloneF=False)
    __check("Parallelization", sm)

    # (3) convert the state machine to a DFA (parallelization created an NFA)
    sm = nfa_to_dfa.do(sm)
    __check("NFA to DFA", sm)

    # (4) determine for each state in the DFA what is the dominating original state
    if FilterDominatedOriginsF: sm.filter_dominated_origins()
    __check("Filter Dominated Origins", sm)

    # (5) perform hopcroft optimization
    #     Note that hopcroft optimization does consider the original acceptance
    #     states when deciding if two state sets are equivalent.
    sm = hopcroft.do(sm)
    __check("Hopcroft Minimization", sm)

    return sm
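A loose analogy with Python's 're' module (illustrative only; the pattern names below are invented): running several patterns in parallel while remembering which original pattern matched is exactly what origin tracing accomplishes, and preferring the first-listed alternative mirrors the dominance filtering:

import re

patterns = {"IF": r"if", "IDENTIFIER": r"[a-z]+"}
combined = re.compile("|".join("(?P<%s>%s)" % kv for kv in patterns.items()))

m = combined.match("if")
print(m.lastgroup)   # 'IF': the first-listed pattern wins, the dominated
                     # 'IDENTIFIER' origin is filtered away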
Example No. 18
def parse_mode_option(fh, new_mode):
    LanguageDB = Setup.language_db

    def fit_state_machine(SM):
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else:                         result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return result

    identifier = read_option_start(fh)
    if identifier == None: return False

    verify_word_in_list(identifier, lexer_mode.mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the 
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The corresponding CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm  = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index, trigger_set, AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do, 
                               FileName = fh.name, 
                               LineN    = get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        pattern_sm = fit_state_machine(pattern_sm)
        # For skippers, line and column counting is not really a topic;
        # it is done in the skipper itself.
        pattern_sm.side_info = SideInfo()

        new_mode.add_match(pattern_str, action, pattern_sm)

        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full-fledged regular expression as opener,
        # since it only affects the trigger. Not so for the nested range skipper; see below.

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings' not state machines
            opener_str, opener_sequence = parse_string_constant(fh, "Opener pattern for 'skip_nested_range'")
            
            opener_sm = StateMachine()
            idx = opener_sm.init_state_index
            for letter in opener_sequence:
                idx = opener_sm.add_transition(idx, letter)
            opener_sm.states[idx].set_acceptance(True)
        else:
            opener_str, opener_sm = regular_expression.parse(fh)
            # For 'range skipping' the opener sequence is not needed, only the opener state
            # machine is webbed into the pattern matching state machine.
            opener_sequence       = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = parse_string_constant(fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier, fh)

        # Skipper code is to be generated later
        generator_function = { 
                "skip_range":        skip_range.do,
                "skip_nested_range": skip_nested_range.do,
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName = fh.name, 
                               LineN    = get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"]       = new_mode.name

        fit_state_machine(opener_sm)

        # For skippers, line and column counting is not really a topic;
        # it is done in the skipper itself.
        opener_sm.side_info = SideInfo()

        new_mode.add_match(opener_str, action, opener_sm)

        return True
        
    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern = ""
        if value.newline_suppressor_state_machine.get() != None:
            suppressed_newline_pattern = \
                  "(" + value.newline_suppressor_state_machine.pattern_str + ")" \
                + "(" + value.newline_state_machine.pattern_str + ")"
                                           
            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])
                 
            FileName = value.newline_suppressor_state_machine.file_name
            LineN    = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code_fragment = UserCodeFragment("goto %s;" % get_label("$start", U=True), FileName, LineN)

            suppressed_newline_sm = fit_state_machine(suppressed_newline_sm)

            # Analyze pattern for constant number of newlines, characters, etc.
            suppressed_newline_sm.side_info = SideInfo(
                    character_counter.get_newline_n(suppressed_newline_sm),
                    character_counter.get_character_n(suppressed_newline_sm))

            new_mode.add_match(suppressed_newline_pattern, code_fragment, suppressed_newline_sm,
                               Comment="indentation newline suppressor")

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick: 
        #
        #      Let               newline         
        #      be defined as:    newline ([space]* newline)*
        # 
        # This way empty lines are eaten away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index, value.indentation_count_character_set(), 
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = hopcroft.do(nfa_to_dfa.do(x4), CreateNewStateMachineF=False)

        FileName = value.newline_state_machine.file_name
        LineN    = value.newline_state_machine.line_n
        action   = GeneratedCode(indentation_counter.do, FileName, LineN)

        action.data["indentation_setup"] = value

        sm = fit_state_machine(sm)
        sm.side_info = SideInfo(character_counter.get_newline_n(sm),
                                character_counter.get_character_n(sm))
        new_mode.add_match(value.newline_state_machine.pattern_str,
                           action, sm, Comment="indentation newline")

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert lexer_mode.mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = lexer_mode.mode_option_info_db[identifier]
    if option_info.domain != None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (Value, Option) + \
                  "Though, possible for this option are only: %s." % repr(oi.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True
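The 'indentation' branch above builds 'newline ([space]* newline)*' from state machine primitives. The same construction in Python's 're' shows the intended effect (a sketch; '[ \t]' stands in for the configured indentation character set):

import re

newline_ext = re.compile(r"\n([ \t]*\n)*")
# Blank lines after the first newline are consumed within one match, so the
# indentation count only triggers in front of a line with actual content.
print(repr(newline_ext.match("\n   \n\n    code").group()))   # '\n   \n\n'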
Example No. 19
def get_combined_state_machine(StateMachine_List, FilterDominatedOriginsF=True):
    """Creates a DFA state machine that incorporates the paralell
       process of all pattern passed as state machines in 
       the StateMachine_List. Each origins of each state machine
       are kept in the final state, if it is not dominated.

       Performs: -- parallelization
                 -- translation from NFA to DFA
                 -- Frank Schaefer's adapted Hopcroft optimization.

       Again: The state machine ids of the original state machines
              are traced through the whole process.
              
       FilterDominatedOriginsF, if set to False, disables the filtering
              of dominated origins. This is important for pre-conditions, because
              all successful patterns need to be reported!
                      
    """   
    def __check(Place, sm):
        __check_on_orphan_states(Place, sm)
        __check_on_init_state_not_acceptance(Place, sm)

    def __check_on_orphan_states(Place, sm):
        orphan_state_list = sm.get_orphaned_state_index_list()
        if orphan_state_list == []: return
        error_msg("After '%s'" % Place + "\n" + \
                  "Orphaned state(s) detected in regular expression (optimization lack).\n" + \
                  "Please, log a defect at the projects website quex.sourceforge.net.\n"    + \
                  "Orphan state(s) = " + repr(orphan_state_list)                       + "\n") 

    def __check_on_init_state_not_acceptance(Place, sm):
        init_state = sm.get_init_state()
        if init_state.core().is_acceptance():
            error_msg("After '%s'" % Place + "\n" + \
                      "The initial state is 'acceptance'. This should never appear.\n" + \
                      "Please, log a defect at the projects website quex.sourceforge.net.\n")

        if filter(lambda origin: origin.is_acceptance(), init_state.origins().get_list()) != []:
            error_msg("After '%s'" % Place + "\n" + \
                      "Initial state contains an origin that is 'acceptance'. This should never appear.\n" + \
                      "Please, log a defect at the projects website quex.sourceforge.net.\n")

    # (1) mark in each state machine the machine and its states as 'original'.
    #
    #     This is necessary to trace, in the combined state machine, the
    #     pattern that actually matched. Note that a state machine in
    #     the StateMachine_List represents one possible pattern that can
    #     match the current input.
    #
    map(lambda x: x.mark_state_origins(), StateMachine_List)

    for sm in StateMachine_List:
        assert sm.is_DFA_compliant(), repr(sm)
    
    # (2) setup all patterns in parallel
    sm = parallelize.do(StateMachine_List, CommonTerminalStateF=False)#, CloneF=False)
    __check("Parallelization", sm)

    # (3) convert the state machine to a DFA (parallelization created an NFA)
    sm = nfa_to_dfa.do(sm)
    __check("NFA to DFA", sm)

    # (4) determine for each state in the DFA what is the dominating original state
    if FilterDominatedOriginsF: sm.filter_dominated_origins()
    __check("Filter Dominated Origins", sm)

    # (5) perform hopcroft optimization
    #     Note that hopcroft optimization does consider the original acceptance
    #     states when deciding if two state sets are equivalent.
    sm = hopcroft.do(sm)
    __check("Hopcroft Minimization", sm)

    return sm
Example No. 20
def parse_mode_option(fh, new_mode):
    LanguageDB = Setup.language_db

    def fit_state_machine(SM):
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else: result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return result

    identifier = read_option_start(fh)
    if identifier == None: return False

    verify_word_in_list(identifier, lexer_mode.mode_option_info_db.keys(),
                        "mode option", fh.name,
                        get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(
            fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier,
                      fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The corresponding CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index,
                                  trigger_set,
                                  AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        pattern_sm = fit_state_machine(pattern_sm)
        # For skippers, line and column counting is not really a topic;
        # it is done in the skipper itself.
        pattern_sm.side_info = SideInfo()

        new_mode.add_match(pattern_str, action, pattern_sm)

        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full-fledged regular expression as opener,
        # since it only affects the trigger. Not so for the nested range skipper; see below.

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings' not state machines
            opener_str, opener_sequence = parse_string_constant(
                fh, "Opener pattern for 'skip_nested_range'")

            opener_sm = StateMachine()
            idx = opener_sm.init_state_index
            for letter in opener_sequence:
                idx = opener_sm.add_transition(idx, letter)
            opener_sm.states[idx].set_acceptance(True)
        else:
            opener_str, opener_sm = regular_expression.parse(fh)
            # For 'range skipping' the opener sequence is not needed, only the opener state
            # machine is webbed into the pattern matching state machine.
            opener_sequence = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = parse_string_constant(
            fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier,
                      fh)

        # Skipper code is to be generated later
        generator_function = {
            "skip_range": skip_range.do,
            "skip_nested_range": skip_nested_range.do,
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"] = new_mode.name

        fit_state_machine(opener_sm)

        # For skippers, line and column counting is not really a topic;
        # it is done in the skipper itself.
        opener_sm.side_info = SideInfo()

        new_mode.add_match(opener_str, action, opener_sm)

        return True

    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern = ""
        if value.newline_suppressor_state_machine.get() != None:
            suppressed_newline_pattern = \
                  "(" + value.newline_suppressor_state_machine.pattern_str + ")" \
                + "(" + value.newline_state_machine.pattern_str + ")"

            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])

            FileName = value.newline_suppressor_state_machine.file_name
            LineN = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code_fragment = UserCodeFragment(
                "goto %s;" % get_label("$start", U=True), FileName, LineN)

            suppressed_newline_sm = fit_state_machine(suppressed_newline_sm)

            # Analyze pattern for constant number of newlines, characters, etc.
            suppressed_newline_sm.side_info = SideInfo(
                character_counter.get_newline_n(suppressed_newline_sm),
                character_counter.get_character_n(suppressed_newline_sm))

            new_mode.add_match(suppressed_newline_pattern,
                               code_fragment,
                               suppressed_newline_sm,
                               Comment="indentation newline suppressor")

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #      Let               newline
        #      be defined as:    newline ([space]* newline)*
        #
        # This way empty lines are eaten away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index,
                          value.indentation_count_character_set(),
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = hopcroft.do(nfa_to_dfa.do(x4), CreateNewStateMachineF=False)

        FileName = value.newline_state_machine.file_name
        LineN = value.newline_state_machine.line_n
        action = GeneratedCode(indentation_counter.do, FileName, LineN)

        action.data["indentation_setup"] = value

        sm = fit_state_machine(sm)
        sm.side_info = SideInfo(character_counter.get_newline_n(sm),
                                character_counter.get_character_n(sm))
        new_mode.add_match(value.newline_state_machine.pattern_str,
                           action,
                           sm,
                           Comment="indentation newline")

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert lexer_mode.mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = lexer_mode.mode_option_info_db[identifier]
    if option_info.domain != None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (Value, Option) + \
                  "Though, possible for this option are only: %s." % repr(oi.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True
Example No. 21
def fit_state_machine(SM):
    if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
    else: result = SM
    result = hopcroft.do(result, CreateNewStateMachineF=False)
    return result