コード例 #1
0
def snap_property_set(stream):
    r"""Try to snap a set expression that starts with a two-character escape.

    Two characters are read from 'stream' and dispatched as follows:

       \P  -- rewind, then delegate to property.do(stream).
       \N  -- rewind, then delegate to property.do_shortcut() for the
              UCS Property: Name ("na").
       \G  -- rewind, then delegate to property.do_shortcut() for the
              UCS Property: General_Category ("gc").
       \E  -- no rewind; expects '{ encoding-name }' to follow and asks
              codec_db for the character set supported by that encoding.

    RETURNS: Whatever the delegate returned, or None if none of the
             prefixes matched. In the latter case the stream is put back
             to the position where it was on entry.
    """
    start = stream.tell()
    prefix = stream.read(2)

    if prefix == "\\E":
        # The only case that continues reading behind the prefix.
        skip_whitespace(stream)
        brace_found = check(stream, "{")
        if brace_found == False:
            error_msg("Missing '{' after '\\E'.", stream)
        encoding_name = __snap_until(stream, "}").strip()
        return codec_db.get_supported_unicode_character_set(encoding_name, stream)

    # All remaining cases begin with the stream reset to its entry position;
    # the delegates read the prefix themselves.
    stream.seek(start)

    if prefix == "\\P":
        return property.do(stream)
    if prefix == "\\N":
        return property.do_shortcut(stream, "N", "na")
    if prefix == "\\G":
        return property.do_shortcut(stream, "G", "gc")

    return None
コード例 #2
0
def snap_property_set(stream):
    r"""Snap a property-based or encoding-based character set, if present.

    The function peeks at the next two characters of 'stream':

       \P        -- general property expression, handled by property.do().
       \N, \G    -- shortcuts handled by property.do_shortcut() for the
                    properties Name ("na") and General_Category ("gc").
       \E{name}  -- set of characters supported by the encoding 'name',
                    looked up through codec_db.

    For \P, \N and \G the stream is rewound first, so the handler sees the
    prefix again. If nothing matches, the stream is rewound and None is
    returned.
    """
    # prefix --> (shortcut letter, property alias)
    shortcut_db = {
        "\\N": ("N", "na"),  # UCS Property: Name
        "\\G": ("G", "gc"),  # UCS Property: General_Category
    }

    entry_position = stream.tell()
    lead_in = stream.read(2)

    if lead_in == "\\P":
        stream.seek(entry_position)
        return property.do(stream)

    if lead_in in shortcut_db:
        letter, alias = shortcut_db[lead_in]
        stream.seek(entry_position)
        return property.do_shortcut(stream, letter, alias)

    if lead_in == "\\E":
        skip_whitespace(stream)
        if check(stream, "{") == False:
            error_msg("Missing '{' after '\\E'.", stream)
        raw_name = __snap_until(stream, "}")
        encoding_name = raw_name.strip()
        return codec_db.get_supported_unicode_character_set(
            encoding_name, stream)

    # Not a property set expression: leave the stream untouched.
    stream.seek(entry_position)
    return None
コード例 #3
0
def do(sh, PatternDict, snap_expression=None, snap_set_expression=None):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.

    Assume that '\C' has been snapped already from the stream.

    See function ucs_case_fold_parser.get_fold_set() for details
    about case folding.

    sh                   -- stream to read from.
    PatternDict          -- passed through unchanged to the snap function.
    snap_expression      -- if snap_set_expression is None, this is the
                            function used to parse a RE and the caller
                            expects a state machine.
    snap_set_expression  -- if not None, this is the function used to
                            parse a character set and the caller expects
                            a NumberSet object.

    Options given in '(' ')' may consist of the letters 's', 'm' and 't'.
    Without options, "s" is used for set expressions and "sm" otherwise.
    Option 'm' is rejected for set expressions.

    RETURNS: The case folded set (set expression) or the case folded
             state machine (regular expression).

    Syntax errors are reported through error_msg().
    """
    in_set_expression_f = snap_set_expression is not None

    pos = sh.tell()
    skip_whitespace(sh)

    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # By default 'single' and 'multi' character case folds are active.
        # Sets only get 'single', since 'm' is not permitted for them.
        flag_txt = "s" if in_set_expression_f else "sm"

    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error_msg("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error_msg("Letter '%s' not permitted as case fold option.\n" % letter +
                          "Options are:  's' for simple case fold.\n" +
                          "              'm' for multi character sequence case fold.\n" +
                          "              't' for special turkish case fold rules.", sh)

            if in_set_expression_f and letter == "m":
                sh.seek(pos)
                error_msg("Option 'm' not permitted as case fold option in set expression.\n" +
                          "Set expressions cannot absorb multi character sequences.", sh)

        skip_whitespace(sh)

    # -- parse the expression in '{' '}' which is subject to case folding
    if not check(sh, "{"):
        sh.seek(pos)
        error_msg("Missing '{' for case fold expression.", sh)

    skip_whitespace(sh)
    if in_set_expression_f:
        trigger_set = snap_set_expression(sh, PatternDict)
        # Identity test: must not depend on how the set type implements '=='.
        if trigger_set is None:
            error_msg("Missing character set for case fold in set expression.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for Sets!
        #    First collect all folds, then add them. The set is not touched
        #    while its intervals are being walked.
        #    NOTE(review): 'PromiseToTreatWellF=True' presumably hands out the
        #    set's own interval list -- confirm against NumberSet.
        fold_list = []
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                for x in ucs_case_fold.get_fold_set(i, flag_txt):
                    assert type(x) != list
                    fold_list.append(x)

        for x in fold_list:
            trigger_set.add_interval(Interval(x, x + 1))

        result = trigger_set

    else:
        sm = snap_expression(sh, PatternDict)
        if sm is None:
            error_msg("Missing expression for case fold '\\C'.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for State Machines!
        #    Iterate over snapshots: '__add_case_fold' receives 'sm' and may
        #    change it; on Python 3 'items()' is a live view that must not
        #    change size during iteration.
        for state_idx, state in list(sm.states.items()):
            transitions = state.transitions()
            for target_state_idx, trigger_set in list(transitions.get_map().items()):
                __add_case_fold(sm, flag_txt, trigger_set, state_idx,
                                target_state_idx)

        result = sm

    if not check(sh, "}"):
        sh.seek(pos)
        error_msg("Missing '}' for case fold expression.", sh)

    return result
コード例 #4
0
ファイル: indentation_setup.py プロジェクト: jirkamarsik/quex
def do(fh):
    r"""Parses indentation element definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

    until the closing '>' is found.

    Each definition is '<pattern> => <specifier> [<value>];' where the
    specifier is one of 'space', 'grid', 'bad', 'newline', 'suppressor'.

      space, grid, bad    -- the pattern on the left hand side *must* be
                             matchable by a single character, i.e. it is a
                             character set. Its trigger set is registered.
      newline, suppressor -- the pattern's state machine is registered.

    'space' may be followed by an integer or a variable name; if neither
    is given, 1 is used. 'grid' requires an integer or a variable name.

    RETURNS: The sealed and consistency-checked IndentationSetup object.

    NOTE: Catching of EOF happens in caller: parse_section(...)
    """
    indentation_setup = IndentationSetup(fh)

    while True:
        skip_whitespace(fh)

        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, state_machine = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.",
                      fh)

        verify_word_in_list(
            identifier, ["space", "grid", "bad", "newline", "suppressor"],
            "Unrecognized indentation specifier '%s'." % identifier, fh)

        trigger_set = None
        if identifier in ("space", "bad", "grid"):
            # Two states <=> exactly one step from the init state to the end.
            if len(state_machine.states) != 2:
                error_msg("For indentation '%s' only patterns are addmissible which\n" % identifier +
                          "can be matched by a single character, e.g. \" \" or [a-z].", fh)
            transition_map = state_machine.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            # 'list(...)': on Python 3 'values()' is a view without indexing.
            trigger_set = list(transition_map.values())[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is None:
                # not a number received, is it an identifier?
                value = read_identifier(fh)
                if value == "":
                    # neither number nor identifier: default space count
                    value = 1
            indentation_setup.specify_space(pattern_str, trigger_set, value, fh)

        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value,
                                               fh)
            else:
                # not a number received, is it an identifier?
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set,
                                                   variable, fh)
                else:
                    error_msg(
                        "Missing integer or variable name after keyword 'grid'.",
                        fh)

        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)

        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, state_machine, fh)

        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, state_machine,
                                                 fh)

        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg(
                "Missing ';' after indentation '%s' specification." %
                identifier, fh)
コード例 #5
0
ファイル: indentation_setup.py プロジェクト: jirkamarsik/quex
def do(fh):
    r"""Parses indentation element definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

    until the closing '>' is found.

    Each definition is '<pattern> => <specifier> [<value>];' where the
    specifier is one of 'space', 'grid', 'bad', 'newline', 'suppressor'.

      space, grid, bad    -- the pattern on the left hand side *must* be
                             matchable by a single character, i.e. it is a
                             character set. Its trigger set is registered.
      newline, suppressor -- the pattern's state machine is registered.

    'space' may be followed by an integer or a variable name; if neither
    is given, 1 is used. 'grid' requires an integer or a variable name.

    RETURNS: The sealed and consistency-checked IndentationSetup object.

    NOTE: Catching of EOF happens in caller: parse_section(...)
    """
    indentation_setup = IndentationSetup(fh)

    while True:
        skip_whitespace(fh)

        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, state_machine = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.", fh)

        verify_word_in_list(
            identifier,
            ["space", "grid", "bad", "newline", "suppressor"],
            "Unrecognized indentation specifier '%s'." % identifier,
            fh,
        )

        trigger_set = None
        if identifier in ("space", "bad", "grid"):
            # Two states <=> exactly one step from the init state to the end.
            if len(state_machine.states) != 2:
                error_msg(
                    "For indentation '%s' only patterns are addmissible which\n" % identifier
                    + 'can be matched by a single character, e.g. " " or [a-z].',
                    fh,
                )
            transition_map = state_machine.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            # 'list(...)': on Python 3 'values()' is a view without indexing.
            trigger_set = list(transition_map.values())[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is None:
                # not a number received, is it an identifier?
                value = read_identifier(fh)
                if value == "":
                    # neither number nor identifier: default space count
                    value = 1
            indentation_setup.specify_space(pattern_str, trigger_set, value, fh)

        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set, variable, fh)
                else:
                    error_msg("Missing integer or variable name after keyword 'grid'.", fh)

        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)

        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, state_machine, fh)

        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, state_machine, fh)

        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg("Missing ';' after indentation '%s' specification." % identifier, fh)
コード例 #6
0
def do(sh, PatternDict, snap_expression=None, snap_set_expression=None):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.

    Assume that '\C' has been snapped already from the stream.

    See function ucs_case_fold_parser.get_fold_set() for details
    about case folding.

    sh                   -- stream to read from.
    PatternDict          -- passed through unchanged to the snap function.
    snap_expression      -- if snap_set_expression is None, this is the
                            function used to parse a RE and the caller
                            expects a state machine.
    snap_set_expression  -- if not None, this is the function used to
                            parse a character set and the caller expects
                            a NumberSet object.

    Options given in '(' ')' may consist of the letters 's', 'm' and 't'.
    Without options, "s" is used for set expressions and "sm" otherwise.
    Option 'm' is rejected for set expressions.

    RETURNS: The case folded set (set expression) or the case folded
             state machine (regular expression).

    Syntax errors are reported through error_msg().
    """
    in_set_expression_f = snap_set_expression is not None

    pos = sh.tell()
    skip_whitespace(sh)

    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # By default 'single' and 'multi' character case folds are active.
        # Sets only get 'single', since 'm' is not permitted for them.
        if in_set_expression_f: flag_txt = "s"
        else:                   flag_txt = "sm"

    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error_msg("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error_msg("Letter '%s' not permitted as case fold option.\n" % letter +
                          "Options are:  's' for simple case fold.\n" +
                          "              'm' for multi character sequence case fold.\n" +
                          "              't' for special turkish case fold rules.", sh)

            if in_set_expression_f and letter == "m":
                sh.seek(pos)
                error_msg("Option 'm' not permitted as case fold option in set expression.\n" +
                          "Set expressions cannot absorb multi character sequences.", sh)

        skip_whitespace(sh)

    # -- parse the expression in '{' '}' which is subject to case folding
    if not check(sh, "{"):
        sh.seek(pos)
        error_msg("Missing '{' for case fold expression.", sh)

    skip_whitespace(sh)
    if in_set_expression_f:
        trigger_set = snap_set_expression(sh, PatternDict)
        # Identity test: must not depend on how the set type implements '=='.
        if trigger_set is None:
            error_msg("Missing character set for case fold in set expression.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for Sets!
        #    First collect all folds, then add them. The set is not touched
        #    while its intervals are being walked.
        #    NOTE(review): 'PromiseToTreatWellF=True' presumably hands out the
        #    set's own interval list -- confirm against NumberSet.
        fold_list = []
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                for x in ucs_case_fold.get_fold_set(i, flag_txt):
                    assert type(x) != list
                    fold_list.append(x)

        for x in fold_list:
            trigger_set.add_interval(Interval(x, x+1))

        result = trigger_set

    else:
        sm = snap_expression(sh, PatternDict)
        if sm is None:
            error_msg("Missing expression for case fold '\\C'.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for State Machines!
        #    Iterate over snapshots: '__add_case_fold' receives 'sm' and may
        #    change it; on Python 3 'items()' is a live view that must not
        #    change size during iteration.
        for state_idx, state in list(sm.states.items()):
            transitions = state.transitions()
            for target_state_idx, trigger_set in list(transitions.get_map().items()):
                __add_case_fold(sm, flag_txt, trigger_set, state_idx, target_state_idx)

        result = sm

    if not check(sh, "}"):
        sh.seek(pos)
        error_msg("Missing '}' for case fold expression.", sh)

    return result