Esempio n. 1
0
 def __add_case_fold(sm, Flags, trigger_set, start_state_idx, target_state_idx):
     """Extend one transition of 'sm' by the case folds of its trigger set.

        For every code point covered by the intervals of 'trigger_set' the
        fold set is requested via 'ucs_case_fold.get_fold_set(i, Flags)'.
        Each element of the fold set is handled as follows:

          -- a list is handed to '__add_intermediate_states()' together with
             'sm', 'start_state_idx' and 'target_state_idx' (presumably a
             multi character fold sequence -- confirm against get_fold_set()).

          -- anything else is treated as a single code point 'x' and added
             to 'trigger_set' as the interval [x, x+1).

        'trigger_set' is modified in place. Nothing is returned.
     """
     # NOTE(review): 'trigger_set' grows while its interval list is being
     #               iterated. Confirm that the list returned by
     #               get_intervals(PromiseToTreatWellF=True) tolerates
     #               add_interval() calls during iteration.
     for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
         for i in range(interval.begin, interval.end):
             for x in ucs_case_fold.get_fold_set(i, Flags):
                 # isinstance() instead of an exact type comparison, so that
                 # list subclasses are recognized as sequences, too.
                 if isinstance(x, list):
                     __add_intermediate_states(sm, x, start_state_idx, target_state_idx)
                 else:
                     trigger_set.add_interval(Interval(x, x+1))
Esempio n. 2
0
def snap_case_folded_pattern(sh, PatternDict, NumberSetF=False):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
       Assume that '\C' has been snapped already from the stream.

       See function ucs_case_fold_parser.get_fold_set() for details
       about case folding.

       sh          -- input stream; 'tell()' and 'seek()' are used to
                      rewind to the start of the expression before an
                      error is reported.
       PatternDict -- passed on to 'snap_curly_bracketed_expression()'.
       NumberSetF  -- True:  the bracketed content must deliver a number
                             set; option 'm' is refused and the default
                             options are "s".
                      False: the bracketed content is treated as a state
                             machine; the default options are "sm".

       RETURNS: With 'NumberSetF' the case folded number set, otherwise
                the case folded object delivered by
                'snap_curly_bracketed_expression()'.

       Errors are reported through 'error_msg()'. The code following each
       call assumes that 'error_msg()' does not return.
    """
    def __add_intermediate_states(sm, character_list, start_state_idx, target_state_idx):
        # Chain the letters one after the other, beginning at the start
        # state. The last letter leads into the given target state.
        next_idx = start_state_idx
        for letter in character_list[:-1]:
            next_idx = sm.add_transition(next_idx, letter)
        sm.add_transition(next_idx, character_list[-1], target_state_idx)

    def __add_case_fold(sm, Flags, trigger_set, start_state_idx, target_state_idx):
        # NOTE(review): 'trigger_set' grows while its interval list is being
        #               iterated. Confirm that get_intervals() with
        #               'PromiseToTreatWellF=True' tolerates this.
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                for x in ucs_case_fold.get_fold_set(i, Flags):
                    if isinstance(x, list):
                        # A list element is a sequence of letters: it needs
                        # a path of its own through the state machine.
                        __add_intermediate_states(sm, x, start_state_idx, target_state_idx)
                    else:
                        trigger_set.add_interval(Interval(x, x+1))

    pos = sh.tell()
    skip_whitespace(sh)
    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # By default 'single' and 'multi' character case folds are active.
        # Sets cannot hold multi character sequences => only 'single'.
        flag_txt = "s" if NumberSetF else "sm"
    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error_msg("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error_msg("Letter '%s' not permitted as case fold option.\n" % letter +
                          "Options are:  's' for simple case fold.\n" +
                          "              'm' for multi character sequence case fold.\n" +
                          "              't' for special turkish case fold rules.", sh)

            if NumberSetF and letter == "m":
                sh.seek(pos)
                error_msg("Option 'm' not permitted as case fold option in set expression.\n" +
                          "Set expressions cannot absorb multi character sequences.", sh)

        skip_whitespace(sh)

    # -- parse the expression in '{' '}' which is subject to case folding
    result = snap_curly_bracketed_expression(sh, PatternDict, "case fold operator", "C")[0]
    if NumberSetF:
        trigger_set = result.get_number_set()
        if trigger_set is None:
            error_msg("Expression in case fold does not result in character set.\n" +
                      "The content in '\\C{content}' may start with '[' or '[:'.", sh)

        # -- perform the case fold for Sets!
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                for x in ucs_case_fold.get_fold_set(i, flag_txt):
                    # Option 'm' is refused above => no sequences expected.
                    assert not isinstance(x, list)
                    trigger_set.add_interval(Interval(x, x+1))

        result = trigger_set

    else:
        # -- perform the case fold for State Machines!
        #    Iterate over snapshots: multi character folds call
        #    'add_transition()' on the state machine, which presumably adds
        #    states and transitions. Iterating the live 'items()' views would
        #    then fail under Python 3 ('changed size during iteration').
        for state_idx, state in list(result.states.items()):
            for target_state_idx, trigger_set in list(state.target_map.get_map().items()):
                __add_case_fold(result, flag_txt, trigger_set, state_idx, target_state_idx)

    return result
Esempio n. 3
0
def do(sh, PatternDict, snap_expression=None, snap_set_expression=None):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
       Assume that '\C' has been snapped already from the stream.

       See function ucs_case_fold_parser.get_fold_set() for details
       about case folding.

       sh          -- input stream; 'tell()' and 'seek()' are used to
                      rewind to the start of the expression before an
                      error is reported.
       PatternDict -- passed on to the chosen snap function.

       snap_expression is not None, then snap_expression is the function 
                                to parse a RE and the caller
                                expects a state machine.

       snap_set_expression is not None, then snap_set_expression is the
                                    function to parse a character 
                                    set and caller expects a 
                                    NumberSet object.

       'snap_set_expression' takes precedence if both are given.

       RETURNS: The case folded result of the chosen snap function.

       Errors are reported through 'error_msg()'. The code following each
       call assumes that 'error_msg()' does not return.
    """
    set_expected_f = snap_set_expression is not None

    pos = sh.tell()
    skip_whitespace(sh)
    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # By default 'single' and 'multi' character case folds are active.
        # Sets cannot hold multi character sequences => only 'single'.
        flag_txt = "s" if set_expected_f else "sm"

    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error_msg("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error_msg("Letter '%s' not permitted as case fold option.\n" % letter +
                          "Options are:  's' for simple case fold.\n" +
                          "              'm' for multi character sequence case fold.\n" +
                          "              't' for special turkish case fold rules.", sh)

            if set_expected_f and letter == "m":
                sh.seek(pos)
                error_msg("Option 'm' not permitted as case fold option in set expression.\n" +
                          "Set expressions cannot absorb multi character sequences.", sh)

        skip_whitespace(sh)

    # -- parse the expression in '{' '}' which is subject to case folding
    if not check(sh, "{"):
        sh.seek(pos)
        error_msg("Missing '{' for case fold expression.", sh)

    skip_whitespace(sh)
    if set_expected_f:
        trigger_set = snap_set_expression(sh, PatternDict)
        if trigger_set is None:
            error_msg(
                "Missing character set for case fold in set expression.\n" +
                "The content in '\\C{content}' should start with '[' or '[:'.",
                sh)

        # -- perform the case fold for Sets!
        # NOTE(review): 'trigger_set' grows while its interval list is being
        #               iterated. Confirm that get_intervals() with
        #               'PromiseToTreatWellF=True' tolerates this.
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                for x in ucs_case_fold.get_fold_set(i, flag_txt):
                    # Option 'm' is refused above => no sequences expected.
                    assert not isinstance(x, list)
                    trigger_set.add_interval(Interval(x, x + 1))

        result = trigger_set

    else:
        sm = snap_expression(sh, PatternDict)
        if sm is None:
            # The backslash is escaped explicitly; '\C' is not a valid
            # escape sequence. The resulting text is unchanged.
            error_msg(
                "Missing expression for case fold '\\C'.\n" +
                "The content in '\\C{content}' should start with '[' or '[:'.",
                sh)

        # -- perform the case fold for State Machines!
        #    Iterate over snapshots: '__add_case_fold()' receives the state
        #    machine and presumably adds states and transitions to it.
        #    Iterating the live 'items()' views would then fail under
        #    Python 3 ('changed size during iteration').
        for state_idx, state in list(sm.states.items()):
            transitions = state.transitions()
            for target_state_idx, trigger_set in list(transitions.get_map().items()):
                __add_case_fold(sm, flag_txt, trigger_set, state_idx,
                                target_state_idx)

        result = sm

    if not check(sh, "}"):
        sh.seek(pos)
        error_msg("Missing '}' for case fold expression.", sh)

    return result