Exemple #1
0
 def __add_case_fold(sm, Flags, trigger_set, start_state_idx, target_state_idx):
     """Extend the transition 'start_state_idx' --> 'target_state_idx' by the
     case folds of every character in 'trigger_set'.

     sm               -- state machine that receives the additional states
                         required for multi character folds.
     Flags            -- case fold options, passed to get_fold_set().
     trigger_set      -- character set of the transition; single character
                         folds are added to it in place.
     start_state_idx  -- index of the state where the transition starts.
     target_state_idx -- index of the state where the transition ends.

     An entry of a fold set is either a single character code or a list of
     character codes (a multi character fold).
     """
     # Collect first, apply afterwards. The intervals are requested with
     # 'PromiseToTreatWellF=True', which (judging by the name) hands out the
     # set's own intervals under the promise not to alter them. Calling
     # 'add_interval()' inside the loop would change the set while its
     # intervals are still being walked.
     pending = []
     for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
         for i in range(interval.begin, interval.end):
             pending.extend(ucs_case_fold.get_fold_set(i, Flags))

     # Apply in the order in which the folds were found.
     for x in pending:
         if isinstance(x, list):
             __add_intermediate_states(sm, x, start_state_idx, target_state_idx)
         else:
             trigger_set.add_interval(Interval(x, x+1))
Exemple #2
0
 def __add_case_fold(sm, Flags, trigger_set, start_state_idx,
                     target_state_idx):
     """Extend the transition 'start_state_idx' --> 'target_state_idx' by the
     case folds of every character in 'trigger_set'.

     sm               -- state machine that receives the additional states
                         required for multi character folds.
     Flags            -- case fold options, passed to get_fold_set().
     trigger_set      -- character set of the transition; single character
                         folds are added to it in place.
     start_state_idx  -- index of the state where the transition starts.
     target_state_idx -- index of the state where the transition ends.

     An entry of a fold set is either a single character code or a list of
     character codes (a multi character fold).
     """
     # Collect first, apply afterwards. The intervals are requested with
     # 'PromiseToTreatWellF=True', which (judging by the name) hands out the
     # set's own intervals under the promise not to alter them. Calling
     # 'add_interval()' inside the loop would change the set while its
     # intervals are still being walked.
     pending = []
     for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
         for i in range(interval.begin, interval.end):
             pending.extend(ucs_case_fold.get_fold_set(i, Flags))

     # Apply in the order in which the folds were found.
     for x in pending:
         if isinstance(x, list):
             __add_intermediate_states(sm, x, start_state_idx,
                                       target_state_idx)
         else:
             trigger_set.add_interval(Interval(x, x + 1))
Exemple #3
0
            txt += "(%04X)" % x
        else:
            for xe in x:
                txt += map_unicode_to_utf8(xe)
                txt += "(%04X)" % xe
        txt += ", "

    if len(txt) != 0: txt = txt[:-2]
    return txt

print "---------------------------------------------"

for letter in [u"A", u"I", u"İ", u"J", u"K", u"S", u"Ċ", u"Ø", u"É", u"Ω", u"Π"]:
    code = ord(letter)
    # result = letter + u" --> " + pump(parser.get_fold_set(code, flags)) + u"\n"
    print letter, " --> ", pump(parser.get_fold_set(code, flags))

for letter in [u"a", u"ı", u"i", u"j", u"k", u"s", u"ċ", u"ø", u"é", u"ω", u"π"]:
    code = ord(letter)
    print letter, " --> ", pump(parser.get_fold_set(code, flags))

print "---------------------------------------------"
letter_list = [ 
                u"a",
                u"ß", # LATIN SMALL LETTER SHARP S
                u"ΐ", # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
                u"ʼn", # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
                u"İ", # LATIN CAPITAL LETTER I WITH DOT ABOVE
                u"ff", # LATIN SMALL LIGATURE FF
                u"ffi", # LATIN SMALL LIGATURE FFI
                u"ﬗ",  # ARMENIAN SMALL LIGATURE MEN XEH
Exemple #4
0
                txt += map_unicode_to_utf8(xe)
                txt += "(%04X)" % xe
        txt += ", "

    if len(txt) != 0: txt = txt[:-2]
    return txt


print "---------------------------------------------"

for letter in [
        u"A", u"I", u"İ", u"J", u"K", u"S", u"Ċ", u"Ø", u"É", u"Ω", u"Π"
]:
    code = ord(letter)
    # result = letter + u" --> " + pump(parser.get_fold_set(code, flags)) + u"\n"
    print letter, " --> ", pump(parser.get_fold_set(code, flags))

for letter in [
        u"a", u"ı", u"i", u"j", u"k", u"s", u"ċ", u"ø", u"é", u"ω", u"π"
]:
    code = ord(letter)
    print letter, " --> ", pump(parser.get_fold_set(code, flags))

print "---------------------------------------------"
letter_list = [
    u"a",
    u"ß",  # LATIN SMALL LETTER SHARP S
    u"ΐ",  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
    u"ʼn",  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
    u"İ",  # LATIN CAPITAL LETTER I WITH DOT ABOVE
    u"ff",  # LATIN SMALL LIGATURE FF
Exemple #5
0
def snap_case_folded_pattern(sh, PatternDict, NumberSetF=False):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
       Assume that '\C' has been snapped already from the stream.

       The optional part in '(' ')' selects the kinds of case folds:

          's' -- simple case fold,
          'm' -- multi character sequence case fold,
          't' -- special turkish case fold rules.

       Without it, "s" is used if 'NumberSetF' is set and "sm" otherwise.

       sh          -- stream to read the expression from.
       PatternDict -- passed on to snap_curly_bracketed_expression().
       NumberSetF  -- True:  the content must be a character set; the
                             folded number set is returned.
                      False: the content is treated as a state machine
                             whose transitions are extended by case folds;
                             the state machine is returned.

       Errors in the expression are reported through error.log().

       See function ucs_case_fold_parser.get_fold_set() for details
       about case folding.
    """
    def __add_intermediate_states(sm, character_list, start_state_idx, target_state_idx):
        # Lay a path for a multi character fold from the start state to the
        # target state: all but the last character lead along intermediate
        # states; add_transition() is taken to return the index of the state
        # it leads to. The last character closes the path at the target.
        next_idx = start_state_idx
        for letter in character_list[:-1]:
            next_idx = sm.add_transition(next_idx, letter)
        sm.add_transition(next_idx, character_list[-1], target_state_idx)

    def __collect_folds(trigger_set, Flags):
        # Gather the fold entries of all characters of 'trigger_set' WITHOUT
        # touching the set. The intervals are requested with
        # 'PromiseToTreatWellF=True', which (judging by the name) hands out
        # the set's own intervals under the promise not to alter them. So,
        # additions must wait until the walk over the intervals is done.
        collected = []
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                collected.extend(ucs_case_fold.get_fold_set(i, Flags))
        return collected

    def __add_case_fold(sm, Flags, trigger_set, start_state_idx, target_state_idx):
        # A fold entry is either a single character code or a list of
        # character codes (multi character fold).
        for x in __collect_folds(trigger_set, Flags):
            if isinstance(x, list):
                __add_intermediate_states(sm, x, start_state_idx, target_state_idx)
            else:
                trigger_set.add_interval(Interval(x, x+1))

    pos = sh.tell()
    skip_whitespace(sh)
    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # By default 'single' and 'multi' character case folds are active
        flag_txt = "s" if NumberSetF else "sm"
    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error.log("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error.log("Letter '%s' not permitted as case fold option.\n" % letter +
                          "Options are:  's' for simple case fold.\n" +
                          "              'm' for multi character sequence case fold.\n" +
                          "              't' for special turkish case fold rules.", sh)

            if NumberSetF and letter == "m":
                sh.seek(pos)
                error.log("Option 'm' not permitted as case fold option in set expression.\n" +
                          "Set expressions cannot absorb multi character sequences.", sh)

        skip_whitespace(sh)

    result = snap_curly_bracketed_expression(sh, PatternDict, "case fold operator", "C")[0]
    if NumberSetF:
        trigger_set = result.get_number_set()
        if trigger_set is None:
            error.log("Expression in case fold does not result in character set.\n" +
                      "The content in '\\C{content}' may start with '[' or '[:'.", sh)

        # -- perform the case fold for Sets!
        #    (option 'm' has been refused above, so no sequences can occur)
        for x in __collect_folds(trigger_set, flag_txt):
            assert not isinstance(x, list)
            trigger_set.add_interval(Interval(x, x+1))

        result = trigger_set

    else:
        # -- perform the case fold for State Machines!
        #    Iterate over snapshots: multi character folds add states and
        #    transitions to the state machine while it is being walked.
        for state_idx, state in list(result.states.items()):
            for target_state_idx, trigger_set in list(state.target_map.get_map().items()):
                __add_case_fold(result, flag_txt, trigger_set, state_idx, target_state_idx)

    return result
Exemple #6
0
def snap_case_folded_pattern(sh, PatternDict, NumberSetF=False):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
       Assume that '\C' has been snapped already from the stream.

       The optional flags select the kinds of case folds:

          's' -- simple case fold,
          'm' -- multi character sequence case fold,
          't' -- special turkish case fold rules.

       The default is "s" if 'NumberSetF' is set and "sm" otherwise.

       sh          -- stream to read the expression from.
       PatternDict -- passed on to snap_curly_bracketed_expression().
       NumberSetF  -- True:  the content must be a non-empty character set;
                             the folded number set is returned.
                      False: the content is treated as a DFA whose
                             transitions are extended by case folds; the
                             DFA is returned.

       Errors in the expression are reported through error.log().

       See function ucs_case_fold_parser.get_fold_set() for details
       about case folding.
    """
    def __add_intermediate_states(sm, character_list, start_state_idx,
                                  target_state_idx):
        # Lay a path for a multi character fold from the start state to the
        # target state: all but the last character lead along intermediate
        # states; add_transition() is taken to return the index of the state
        # it leads to. The last character closes the path at the target.
        next_idx = start_state_idx
        for letter in character_list[:-1]:
            next_idx = sm.add_transition(next_idx, letter)
        sm.add_transition(next_idx, character_list[-1], target_state_idx)

    def __collect_folds(trigger_set, Flags):
        # Gather the fold entries of all characters of 'trigger_set' WITHOUT
        # touching the set. The intervals are requested with
        # 'PromiseToTreatWellF=True', which (judging by the name) hands out
        # the set's own intervals under the promise not to alter them. So,
        # additions must wait until the walk over the intervals is done.
        collected = []
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                collected.extend(ucs_case_fold.get_fold_set(i, Flags))
        return collected

    def __add_case_fold(sm, Flags, trigger_set, start_state_idx,
                        target_state_idx):
        # A fold entry is either a single character code or a list of
        # character codes (multi character fold).
        for x in __collect_folds(trigger_set, Flags):
            if isinstance(x, list):
                __add_intermediate_states(sm, x, start_state_idx,
                                          target_state_idx)
            else:
                trigger_set.add_interval(Interval(x, x + 1))

    pos = sh.tell()
    skip_whitespace(sh)
    # -- parse the optional options in '(' ')' brackets
    default_flag_txt = "s" if NumberSetF else "sm"

    flag_txt = optional_flags(
        sh, "case fold", default_flag_txt, {
            "s": "simple case fold",
            "m": "multi character sequence case fold",
            "t": "special turkish case fold rules",
        }, [])

    if NumberSetF and "m" in flag_txt:
        sh.seek(pos)
        error.log("Option 'm' not permitted as case fold option in set expression.\n" +
                  "Set expressions cannot absorb multi character sequences.", sh)

    skip_whitespace(sh)

    result = snap_curly_bracketed_expression(sh, PatternDict,
                                             "case fold operator", "C")[0]

    if NumberSetF:
        trigger_set = result.get_number_set()
        if trigger_set is None or trigger_set.is_empty():
            error.log(
                "Expression in case fold does not result in character set.\n" +
                "The content in '\\C{content}' may start with '[' or '[:'.",
                sh)

        # -- perform the case fold for Sets!
        #    (option 'm' has been refused above, so no sequences can occur)
        for x in __collect_folds(trigger_set, flag_txt):
            assert not isinstance(x, list)
            trigger_set.add_interval(Interval(x, x + 1))

        result = trigger_set

    else:
        # -- perform the case fold for DFAs!
        #    Iterate over snapshots: multi character folds add states and
        #    transitions to the DFA while it is being walked.
        for state_idx, state in list(result.states.items()):
            target_items = list(state.target_map.get_map().items())
            for target_state_idx, trigger_set in target_items:
                __add_case_fold(result, flag_txt, trigger_set, state_idx,
                                target_state_idx)

    return result
Exemple #7
0
def snap_case_folded_pattern(sh, PatternDict, NumberSetF=False):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
       Assume that '\C' has been snapped already from the stream.

       The optional part in '(' ')' selects the kinds of case folds:

          's' -- simple case fold,
          'm' -- multi character sequence case fold,
          't' -- special turkish case fold rules.

       Without it, "s" is used if 'NumberSetF' is set and "sm" otherwise.

       sh          -- stream to read the expression from.
       PatternDict -- passed on to snap_curly_bracketed_expression().
       NumberSetF  -- True:  the content must be a character set; the
                             folded number set is returned.
                      False: the content is treated as a state machine
                             whose transitions are extended by case folds;
                             the state machine is returned.

       Errors in the expression are reported through error.log().

       See function ucs_case_fold_parser.get_fold_set() for details
       about case folding.
    """
    def __add_intermediate_states(sm, character_list, start_state_idx,
                                  target_state_idx):
        # Lay a path for a multi character fold from the start state to the
        # target state: all but the last character lead along intermediate
        # states; add_transition() is taken to return the index of the state
        # it leads to. The last character closes the path at the target.
        next_idx = start_state_idx
        for letter in character_list[:-1]:
            next_idx = sm.add_transition(next_idx, letter)
        sm.add_transition(next_idx, character_list[-1], target_state_idx)

    def __collect_folds(trigger_set, Flags):
        # Gather the fold entries of all characters of 'trigger_set' WITHOUT
        # touching the set. The intervals are requested with
        # 'PromiseToTreatWellF=True', which (judging by the name) hands out
        # the set's own intervals under the promise not to alter them. So,
        # additions must wait until the walk over the intervals is done.
        collected = []
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                collected.extend(ucs_case_fold.get_fold_set(i, Flags))
        return collected

    def __add_case_fold(sm, Flags, trigger_set, start_state_idx,
                        target_state_idx):
        # A fold entry is either a single character code or a list of
        # character codes (multi character fold).
        for x in __collect_folds(trigger_set, Flags):
            if isinstance(x, list):
                __add_intermediate_states(sm, x, start_state_idx,
                                          target_state_idx)
            else:
                trigger_set.add_interval(Interval(x, x + 1))

    pos = sh.tell()
    skip_whitespace(sh)
    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # By default 'single' and 'multi' character case folds are active
        flag_txt = "s" if NumberSetF else "sm"
    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error.log("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t",
                                                     "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error.log("Letter '%s' not permitted as case fold option.\n" % letter +
                          "Options are:  's' for simple case fold.\n" +
                          "              'm' for multi character sequence case fold.\n" +
                          "              't' for special turkish case fold rules.", sh)

            if NumberSetF and letter == "m":
                sh.seek(pos)
                error.log("Option 'm' not permitted as case fold option in set expression.\n" +
                          "Set expressions cannot absorb multi character sequences.", sh)

        skip_whitespace(sh)

    result = snap_curly_bracketed_expression(sh, PatternDict,
                                             "case fold operator", "C")[0]
    if NumberSetF:
        trigger_set = result.get_number_set()
        if trigger_set is None:
            error.log(
                "Expression in case fold does not result in character set.\n" +
                "The content in '\\C{content}' may start with '[' or '[:'.",
                sh)

        # -- perform the case fold for Sets!
        #    (option 'm' has been refused above, so no sequences can occur)
        for x in __collect_folds(trigger_set, flag_txt):
            assert not isinstance(x, list)
            trigger_set.add_interval(Interval(x, x + 1))

        result = trigger_set

    else:
        # -- perform the case fold for State Machines!
        #    Iterate over snapshots: multi character folds add states and
        #    transitions to the state machine while it is being walked.
        for state_idx, state in list(result.states.items()):
            target_items = list(state.target_map.get_map().items())
            for target_state_idx, trigger_set in target_items:
                __add_case_fold(result, flag_txt, trigger_set, state_idx,
                                target_state_idx)

    return result