Example #1
0
def get_sm_list(FSM0, FSM1, FSM2):
    # SPECIALITIES: -- sm0 and sm1 have an intersection between their second
    #                  transition.
    #               -- sm1 transits further upon acceptance.
    #               -- sm2 has only one transition.
    # Generate DFA that does not have any intersection with
    # the loop transitions.
    sm0 = DFA()
    si = sm0.add_transition(sm0.init_state_index, FSM0)
    si = sm0.add_transition(si, NS_A, AcceptanceF=True)
    sm0.states[si].mark_acceptance_id(dial.new_incidence_id())

    sm1 = DFA()
    si0 = sm1.add_transition(sm1.init_state_index, FSM1)
    si = sm1.add_transition(si0, NS_A, AcceptanceF=True)
    iid1 = dial.new_incidence_id()
    sm1.states[si].mark_acceptance_id(iid1)
    si = sm1.add_transition(si, NS_B, si0)
    sm1.states[si].mark_acceptance_id(iid1)

    sm2 = DFA()
    si = sm2.add_transition(sm2.init_state_index, FSM2, AcceptanceF=True)
    sm2.states[si].mark_acceptance_id(dial.new_incidence_id())

    return [sm0, sm1, sm2]
Example #2
0
def test(TestString):
    print "expression    = \"" + TestString + "\""
    sm = DFA()
    try:
        trigger_set = character_set.do(StringIO(TestString + "]"))
        sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)
        print "state machine\n", sm
    except RegularExpressionException, x:
        print repr(x)
Example #3
0
def snap_non_control_character(stream, PatternDict):
    __debug_entry("non-control characters", stream)

    # (*) read first character
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    if char_code is None:
        error.log(
            "Character could not be interpreted as UTF8 code or End of File reached prematurely.",
            stream)
    result = DFA()
    result.add_transition(result.init_state_index, char_code, AcceptanceF=True)
    return __debug_exit(result, stream)
Example #4
0
def test_on_UCS_range(Trafo, Source, Drain, CharacterBackwardTrafo):
    sm = DFA()
    acc_db = {}
    for x in range(Source.begin, Source.end):
        ti = sm.add_transition(sm.init_state_index, x, AcceptanceF=True)
        acc_id = len(acc_db)
        sm.states[ti].mark_acceptance_id(acc_id)
        acc_db[x] = acc_id

    if Setup.bad_lexatom_detection_f:
        acc_db[None] = E_IncidenceIDs.BAD_LEXATOM
    else:
        acc_db[None] = None

    state_n_before, result = transform(Trafo, sm)
    # assert state_n_before == len(result.states)

    init_state = result.get_init_state()
    count = 0
    for y in range(Drain.begin, Drain.end):
        # Translate character into
        x = CharacterBackwardTrafo(y)
        # Transit on the translated charater
        ti = init_state.target_map.get_resulting_target_state_index(y)
        # Compare resulting state with the expected state's acceptance
        assert_only_acceptance_id(sm.states, ti, acc_db, x, y)

        count += 1

    print "<terminated: %i transitions ok>" % count
Example #5
0
def do(sh):
    """Converts a uni-code string into a state machine that parses 
       its letters sequentially. Each state in the sequence correponds
       to the sucessful triggering of a letter. Only the last state, though,
       is an acceptance state. Any bailing out before is 'not accepted'. 
       Example:

       "hey" is translated into the state machine:

           (0)-- 'h' -->(1)-- 'e' -->(2)-- 'y' --> ACCEPTANCE
            |            |            |
           FAIL         FAIL         FAIL
    
      Note: The state indices are globally unique. But, they are not necessarily
            0, 1, 2, ... 
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    # resulting state machine
    result = DFA()
    state_idx = result.init_state_index

    # Only \" is a special character '"', any other backslashed character
    # remains as the sequence 'backslash' + character
    for char_code in get_character_code_sequence(sh):
        state_idx = result.add_transition(state_idx, char_code)

    # when the last state has trigger it is supposed to end up in 'acceptance'
    result.states[state_idx].set_acceptance()
    return result
Example #6
0
def create_ALL_BUT_NEWLINE_state_machine(stream):
    global Setup
    result = DFA()
    # NOTE: Buffer control characters are supposed to be filtered out by the code
    #       generator.
    trigger_set = NumberSet(Interval(ord("\n"))).get_complement(
        Setup.buffer_encoding.source_set)
    if trigger_set.is_empty():
        error.log(
            "The set of admissible characters contains only newline.\n"
            "The '.' for 'all but newline' is an empty set.",
            SourceRef.from_FileHandle(stream))

    result.add_transition(result.init_state_index,
                          trigger_set,
                          AcceptanceF=True)
    return result
Example #7
0
def snap_character_set_expression(stream, PatternDict):
    # GRAMMAR:
    #
    # set_expression:
    #                 [: set_term :]
    #                 traditional character set
    #                 \P '{' propperty string '}'
    #                 '{' identifier '}'
    #
    # set_term:
    #                 "alnum"
    #                 "alpha"
    #                 "blank"
    #                 "cntrl"
    #                 "digit"
    #                 "graph"
    #                 "lower"
    #                 "print"
    #                 "punct"
    #                 "space"
    #                 "upper"
    #                 "xdigit"
    #                 "union"        '(' set_term [ ',' set_term ]+ ')'
    #                 "intersection" '(' set_term [ ',' set_term ]+ ')'
    #                 "difference"   '(' set_term [ ',' set_term ]+ ')'
    #                 "complement"   '(' set_term ')'
    #                 set_expression
    #
    trigger_set = snap_set_expression(stream, PatternDict)

    if trigger_set is None:
        error.log("Regular Expression: snap_character_set_expression called for something\n" + \
                  "that does not start with '[:', '[' or '\\P'", stream)
    elif trigger_set.is_empty():
        error.warning(
            "Regular Expression: Character set expression results in empty set.",
            stream)

    # Create state machine that triggers with the trigger set to SUCCESS
    # NOTE: The default for the ELSE transition is FAIL.
    sm = DFA()
    sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)

    return __debug_exit(sm, stream)
Example #8
0
def do(SM_List):
    for sm in SM_List:
        sm.assert_consistency() 

    if any(sm.is_Empty() for sm in SM_List): # If one state machine is '\Empty',
        return DFA.Empty()                   # then the intersection is '\Empty'.

    init_state_setup = tuple(sm.init_state_index for sm in SM_List)
    result           = DFA(AcceptanceF=intersect_acceptance(init_state_setup, SM_List))

    # Result state setup: A result state is setup out of a state from each DFA.
    #                     state_setup[i] is the state from DFA 'SM_List[i]'.
    worklist       = [ (result.init_state_index, init_state_setup) ]
    state_setup_db = {}
    N              = len(SM_List)
    while worklist:
        state_index, state_setup = worklist.pop()

        # Generate Map that shows what lexatoms trigger to what state combination.
        #
        #       NumberSet    Target DFA_State Combination 
        #       [0:23]   --> [ State1, State24, State56 ]
        #       [0:23]   --> [ State5, State21, State55 ]
        #       [24:60]  --> [ State1, State23, State51 ]
        #
        # 'get_intersection_line_up()' only delivers those transitions where there
        # is a transition for each state machine's state.
        line_up = get_intersection_line_up([SM_List[i].states[si].target_map
                                            for i, si in enumerate(state_setup)])
        for target_state_setup, trigger_set in line_up.iteritems():
            assert len(target_state_setup) == N
            target_index, new_f = state_index_for_combination(state_setup_db,
                                                              target_state_setup)

            acceptance_f = intersect_acceptance(target_state_setup, SM_List)
            result.add_transition(state_index, trigger_set, target_index,
                                  AcceptanceF = acceptance_f)

            if new_f:
                worklist.append((target_index, target_state_setup))

    result.delete_hopeless_states()
    return result
Example #9
0
class X:
    def __init__(self, Name):
        sh = StringIO("[:\\P{Script=%s}:]" % Name)
        self.name = Name
        self.charset = regex.snap_set_expression(sh, {})
        self.sm = DFA()
        self.sm.add_transition(self.sm.init_state_index,
                               self.charset,
                               AcceptanceF=True)
        self.id = self.sm.get_id()

    def check(self, SM, TransformFunc):
        """This function throws an exception as soon as one single value
           is not matched according to the expectation.
        """
        print "## [%i] Name = %s" % (self.id, self.name),
        interval_list = self.charset.get_intervals(PromiseToTreatWellF=True)
        interval_count = len(interval_list)
        for interval in interval_list:
            for i in range(interval.begin, interval.end):
                lexatom_seq = TransformFunc(i)

                # Apply sequence to state machine
                state = SM.apply_sequence(lexatom_seq)
                if state is None:
                    error(self.sm, SM, lexatom_seq)

                # All acceptance flags must belong to the original state machine
                acceptance_id_list = [
                    cmd.acceptance_id()
                    for cmd in state.single_entry.get_iterable(SeAccept)
                ]
                if acceptance_id_list and self.id not in acceptance_id_list:
                    print eval("u'\U%08X'" % i)
                    print "#Seq:  ", ["%02X" % x for x in lexatom_seq]
                    print "#acceptance-ids:", acceptance_id_list
                    error(self.sm, SM, lexatom_seq)

        print " (OK=%i)" % interval_count
Example #10
0
def DFA_Newline():
    """Creates a state machine matching newline according to what has been 
    specified in the setup (Setup.dos_carriage_return_newline_f). 

    That is, if is DOS newline then the state machine represents '\r\n' and
    if it is unix only, then it represents '\n'. If both is required they 
    are implemented in parallel.

    RETURNS: DFA
    """
    UnixF = True
    DosF = Setup.dos_carriage_return_newline_f

    NL = ord('\n')  # (pure) newline, i.e. line feed
    CR = ord('\r')  # carriage return

    dfa = DFA()
    if UnixF:
        dfa.add_transition(dfa.init_state_index, NL, AcceptanceF=True)
    if DosF:
        idx = dfa.add_transition(dfa.init_state_index, CR, AcceptanceF=False)
        dfa.add_transition(idx, NL, AcceptanceF=True)

    return beautifier.do(dfa)
Example #11
0
    ca_list = get_ca_list(0x10, 0x60)
    sm_list = get_sm_list(NumberSet.from_range(0x10, 0x40),
                          NumberSet.from_range(0x20, 0x50),
                          NumberSet.from_range(0x30, 0x60))

    # Test for each 'sm' in 'sm_list' is superfluous.
    # It is done in 'AppendixNoI'.
    test(ca_list, sm_list)

elif "Split" in sys.argv:
    # A first transition of a state machine is separated into two, because
    # it is covered by more than one different count action.
    NS1 = NumberSet.from_range(0x10, 0x20)
    NS2 = NumberSet.from_range(0x20, 0x30)
    NS3 = NumberSet.from_range(0x30, 0x40)
    NS4 = NumberSet.from_range(0x40, 0x50)
    ca_list = [
        (NS1, CountAction(E_CharacterCountType.COLUMN, 1)),
        (NS2, CountAction(E_CharacterCountType.COLUMN, 2)),
        (NS3, CountAction(E_CharacterCountType.COLUMN, 3)),
        (NS4, CountAction(E_CharacterCountType.COLUMN, 4)),
    ]

    sm = DFA()
    si = sm.init_state_index
    iid = dial.new_incidence_id()
    ti0 = sm.add_transition(si, NumberSet.from_range(0x1A, 0x4B))
    ac0 = sm.add_transition(ti0, NS_A, AcceptanceF=True)

    test(ca_list, [sm])
Example #12
0
def snap_primary(stream, PatternDict):
    """primary:  " non_double_quote *  "              = character string
                 [ non_rect_bracket_close ]           = set of characters
                 { identifier }                       = pattern replacement
                 ( expression )
                 non_control_character+               = lonely characters
                 primary repetition_cmd
    """
    global SPECIAL_TERMINATOR

    __debug_entry("primary", stream)
    x = stream.read(1)
    if x == "": return __debug_exit(None, stream)

    # -- 'primary' primary
    if x == "\"": result = snap_character_string.do(stream)
    elif x == "[":
        stream.seek(-1, 1)
        result = snap_character_set_expression(stream, PatternDict)
    elif x == "{":
        result = snap_replacement(stream, PatternDict)
    elif x == ".":
        result = create_ALL_BUT_NEWLINE_state_machine(stream)
    elif x == "(":
        result = snap_bracketed_expression(stream, PatternDict)

    elif x.isspace():
        # a lonestanding space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    elif x in ["*", "+", "?"]:
        raise RegularExpressionException(
            "lonely operator '%s' without expression proceeding." % x)

    elif x == "\\":
        result = snap_command(stream, PatternDict)
        if result is None:
            stream.seek(-1, 1)
            trigger_set = snap_property_set(stream)
            if trigger_set is None:
                # snap the '\'
                stream.read(1)
                char_code = snap_backslashed_character.do(stream)
                if char_code is None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
                trigger_set = char_code
            result = DFA()
            result.add_transition(result.init_state_index,
                                  trigger_set,
                                  AcceptanceF=True)

    elif x not in CONTROL_CHARACTERS and x != SPECIAL_TERMINATOR:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        result = snap_non_control_character(stream, PatternDict)

    else:
        # NOTE: This includes the '$' sign which means 'end of line'
        #       because the '$' sign is in CONTROL_CHARACTERS, but is not checked
        #       against. Thus, it it good to leave here on '$' because the
        #       '$' sign is handled on the very top level.
        # this is not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command?
    result_repeated = __snap_repetition_range(result, stream)
    if result_repeated is not None: result = result_repeated

    return __debug_exit(result, stream)
Example #13
0
def test(A_txt, B_txt):
    """Performs: \CutBegin{P Q} and prints the result!

    """
    print "---------------------------"

    A, B = parse_REs(A_txt, B_txt)
    result = core(A, B)

    print
    print "result = ", result.get_string(NormalizeF=True)


if False:  # Selected Test
    dfa = DFA()
    dfa.add_transition(dfa.init_state_index, 1, AcceptanceF=True)

    set_unique_transition_f()

    # for name in get_sm_shape_names_list():
    if True:
        name = "DEBUG"
        Q = get_sm_shape_by_name_with_acceptance(name)
        print "# %s" % name, Q
        cut_Q_Q = __operation(Q, Q)
        # \CutBegin{Q          Q}          = \Empty
        print "#cut_Q_Q:", cut_Q_Q
        print "#verdict:", cut_Q_Q.is_Nothing()
        try:
            assert cut_Q_Q.is_Nothing()
        except: