Example #1
def __parse_section(fh, descriptor, already_defined_list):
    global token_type_code_fragment_db
    assert type(already_defined_list) == list

    SubsectionList = ["name", "file_name", "standard", "distinct", "union", "inheritable", "noid"] \
                      + token_type_code_fragment_db.keys()

    position = fh.tell()
    skip_whitespace(fh)
    word = read_identifier(fh)
    if word == "":
        fh.seek(position)
        if check(fh, "}"): 
            fh.seek(position) 
            return False
        error_msg("Missing token_type section ('standard', 'distinct', or 'union').", fh)

    verify_word_in_list(word, SubsectionList, 
                        "Subsection '%s' not allowed in token_type section." % word, fh)

    if word == "name":
        if not check(fh, "="):
            error_msg("Missing '=' in token_type 'name' specification.", fh)
        descriptor.class_name, descriptor.name_space, descriptor.class_name_safe = read_namespaced_name(fh, "token_type")
        if not check(fh, ";"):
            error_msg("Missing terminating ';' in token_type 'name' specification.", fh)

    elif word == "inheritable":
        descriptor.open_for_derivation_f = True
        check_or_die(fh, ";")

    elif word == "noid":
        descriptor.token_contains_token_id_f = False
        check_or_die(fh, ";")

    elif word == "file_name":
        if not check(fh, "="):
            error_msg("Missing '=' in token_type 'file_name' specification.", fh)
        descriptor.set_file_name(read_until_letter(fh, ";"))
        if not check(fh, ";"):
            error_msg("Missing terminating ';' in token_type 'file_name' specification.", fh)

    elif word in ["standard", "distinct", "union"]:
        if   word == "standard": parse_standard_members(fh, word, descriptor, already_defined_list)
        elif word == "distinct": parse_distinct_members(fh, word, descriptor, already_defined_list)
        elif word == "union":    parse_union_members(fh, word, descriptor, already_defined_list)

        if not check(fh, "}"):
            fh.seek(position)
            error_msg("Missing closing '}' at end of token_type section '%s'." % word, fh);

    elif word in token_type_code_fragment_db.keys():
        fragment     = code_fragment.parse(fh, word, AllowBriefTokenSenderF=False)        
        descriptor.__dict__[word] = fragment

    else: 
        assert False, "This code section section should not be reachable because 'word'\n" + \
                      "was checked to fit in one of the 'elif' cases."

    return True
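
Every example on this page funnels keyword validation through verify_word_in_list, whose definition is not shown here. Judging purely from the call sites (a word, a list of allowed words, an error text, then error_msg-style location arguments), a minimal sketch might look as follows; the body is an assumption, not the actual quex implementation, which reports through error_msg instead of raising.

def verify_word_in_list(word, word_list, comment, *error_location_args):
    # Hypothetical reconstruction from the call sites on this page:
    # succeed silently when 'word' is known, otherwise abort with
    # 'comment' plus the list of allowed words. 'error_location_args'
    # stands in for the file handle or (file name, line number) pair
    # that callers forward for error reporting.
    if word in word_list:
        return True
    allowed = ", ".join("'%s'" % w for w in sorted(word_list))
    raise SystemExit(comment + "\nAllowed words are: %s." % allowed)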
Example #2
def __parse_event(new_mode, fh, word):
    pos = fh.tell()

    # Allow '<<EOF>>' and '<<FAIL>>' out of respect for classical tools like 'lex'
    if   word == "<<EOF>>":                  word = "on_end_of_stream"
    elif word == "<<FAIL>>":                 word = "on_failure"
    elif word in blackboard.all_section_title_list:
        error_msg("Pattern '%s' is a quex section title. Has the closing '}' of mode %s \n" % (word, new_mode.name) \
                  + "been forgotten? Else use quotes, i.e. \"%s\"." % word, fh)
    elif len(word) < 3 or word[:3] != "on_": return False

    comment = "Unknown event handler '%s'. \n" % word + \
              "Note, that any pattern starting with 'on_' is considered an event handler.\n" + \
              "use double quotes to bracket patterns that start with 'on_'."

    __general_validate(fh, new_mode, word, pos)
    verify_word_in_list(word, event_handler_db.keys(), comment, fh)
    __validate_required_token_policy_queue(word, fh, pos)

    continue_f = True
    if word == "on_end_of_stream":
        # When a termination token is sent, no other token shall follow. 
        # => Enforce return from the analyzer! Do not allow CONTINUE!
        continue_f = False

    new_mode.events[word] = code_fragment.parse(fh, "%s::%s event handler" % (new_mode.name, word),
                                                ContinueF=continue_f)

    return True
Example #3
def parse(fh, new_mode):
    source_reference = SourceRef.from_FileHandle(fh)

    identifier = read_option_start(fh)
    if identifier is None: return False

    verify_word_in_list(identifier, mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if   identifier == "skip":
        value = __parse_skip_option(fh, new_mode, identifier)

    elif identifier in ["skip_range", "skip_nested_range"]:
        value = __parse_range_skipper_option(fh, identifier, new_mode)
        
    elif identifier == "indentation":
        value = counter.parse_indentation(fh)
        value.set_containing_mode_name(new_mode.name)
        blackboard.required_support_indentation_count_set()

    elif identifier == "counter":
        value = counter.parse_line_column_counter(fh)

    elif identifier in ("entry", "exit", "restrict"):
        value = read_option_value(fh, ListF=True) # A 'list' of strings
    else:
        value = read_option_value(fh)             # A single string

    # Finally, set the option
    new_mode.option_db.enter(identifier, value, source_reference, new_mode.name)
    return True
Example #4
def __validate_definition(TheCodeFragment, NameStr, AlreadyMentionedList,
                          StandardMembersF):
    FileName = TheCodeFragment.sr.file_name
    LineN = TheCodeFragment.sr.line_n
    if StandardMembersF:
        verify_word_in_list(
            NameStr, TokenType_StandardMemberList,
            "Member name '%s' not allowed in token_type section 'standard'." %
            NameStr, FileName, LineN)

        # Standard Members are all numeric types
        if    TheCodeFragment.contains_string(Lng.Match_string) \
           or TheCodeFragment.contains_string(Lng.Match_vector) \
           or TheCodeFragment.contains_string(Lng.Match_map):
            type_str = TheCodeFragment.get_text()
            error_msg("Numeric type required.\n" + \
                      "Example: <token_id: uint16_t>, Found: '%s'\n" % type_str, FileName, LineN)
    else:
        if NameStr in TokenType_StandardMemberList:
            error_msg(
                "Member '%s' only allowed in 'standard' section." % NameStr,
                FileName, LineN)

    for candidate in AlreadyMentionedList:
        if candidate[0] != NameStr: continue
        error_msg("Token type member name '%s' defined twice." % NameStr,
                  FileName,
                  LineN,
                  DontExitF=True)
        error_msg("Previously defined here.", candidate[1].sr.file_name,
                  candidate[1].sr.line_n)
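
The two-step report at the end of this example leans on error_msg's DontExitF flag: the first call announces the duplicate without terminating, the second points at the earlier definition and then exits. error_msg itself is not shown on this page; the sketch below is a hedged guess at the behavior the callers rely on, with the signature and formatting assumed rather than taken from quex.

import sys

def error_msg(message, file_name=None, line_n=None, DontExitF=False, WarningF=False):
    # Assumed behavior: write a location-prefixed message to stderr and
    # exit, unless the caller chains a follow-up message by passing
    # DontExitF=True. The real helper also accepts a file handle in
    # place of (file_name, line_n).
    prefix = "%s:%s: " % (file_name, line_n) if file_name is not None else ""
    sys.stderr.write(prefix + message + "\n")
    if not DontExitF:
        sys.exit(1)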
Example #5
def __validate_definition(TheCodeFragment, NameStr, 
                          AlreadyMentionedList, StandardMembersF):
    FileName = TheCodeFragment.sr.file_name
    LineN    = TheCodeFragment.sr.line_n
    if StandardMembersF:
        verify_word_in_list(NameStr, TokenType_StandardMemberList, 
                            "Member name '%s' not allowed in token_type section 'standard'." % NameStr, 
                            FileName, LineN)

        # Standard Members are all numeric types
        if    TheCodeFragment.contains_string(Lng.Match_string) \
           or TheCodeFragment.contains_string(Lng.Match_vector) \
           or TheCodeFragment.contains_string(Lng.Match_map):
            type_str = TheCodeFragment.get_text()
            error_msg("Numeric type required.\n" + \
                      "Example: <token_id: uint16_t>, Found: '%s'\n" % type_str, FileName, LineN)
    else:
        if NameStr in TokenType_StandardMemberList:
            error_msg("Member '%s' only allowed in 'standard' section." % NameStr,
                      FileName, LineN)

    for candidate in AlreadyMentionedList:
        if candidate[0] != NameStr: continue 
        error_msg("Token type member name '%s' defined twice." % NameStr,
                  FileName, LineN, DontExitF=True)
        error_msg("Previously defined here.",
                  candidate[1].sr.file_name, candidate[1].sr.line_n)
Example #6
def __start_mode(applicable_mode_name_list, mode_name_list):
    """If more then one mode is defined, then that requires an explicit 
       definition 'start = mode'.
    """
    assert len(applicable_mode_name_list) != 0

    start_mode = blackboard.initial_mode.get_pure_code()
    if start_mode == "":
        # Choose an applicable mode as start mode
        start_mode = applicable_mode_name_list[0]
        blackboard.initial_mode = CodeFragment(start_mode)
        if len(applicable_mode_name_list) > 1:
            error_msg("No initial mode defined via 'start' while more than one applicable mode exists.\n" + \
                      "Use for example 'start = %s;' in the quex source file to define an initial mode." \
                      % start_mode)
        # This Branch: start mode is applicable and present

    else:
        FileName = blackboard.initial_mode.filename
        LineN = blackboard.initial_mode.line_n
        # Start mode present and applicable?
        verify_word_in_list(start_mode, mode_name_list,
                            "Start mode '%s' is not defined." % start_mode,
                            FileName, LineN)
        verify_word_in_list(
            start_mode, applicable_mode_name_list,
            "Start mode '%s' is inheritable only and cannot be instantiated." %
            start_mode, FileName, LineN)
Example #7
def __start_mode(applicable_mode_name_list, mode_name_list):
    """If more then one mode is defined, then that requires an explicit 
       definition 'start = mode'.
    """
    assert len(applicable_mode_name_list) != 0

    start_mode = blackboard.initial_mode.get_pure_code()
    if start_mode == "":
        # Choose an applicable mode as start mode
        start_mode              = applicable_mode_name_list[0]
        blackboard.initial_mode = CodeFragment(start_mode)
        if len(applicable_mode_name_list) > 1:
            error_msg("No initial mode defined via 'start' while more than one applicable mode exists.\n" + \
                      "Use for example 'start = %s;' in the quex source file to define an initial mode." \
                      % start_mode)
        # This Branch: start mode is applicable and present

    else: 
        FileName = blackboard.initial_mode.filename
        LineN    = blackboard.initial_mode.line_n
        # Start mode present and applicable?
        verify_word_in_list(start_mode, mode_name_list,
                            "Start mode '%s' is not defined." % start_mode,
                            FileName, LineN)
        verify_word_in_list(start_mode, applicable_mode_name_list,
                            "Start mode '%s' is inheritable only and cannot be instantiated." % start_mode,
                            FileName, LineN)
Example #8
def parse(fh, new_mode):
    source_reference = SourceRef.from_FileHandle(fh)

    identifier = read_option_start(fh)
    if identifier is None: return False

    verify_word_in_list(identifier, mode_option_info_db.keys(), "mode option",
                        fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        value = __parse_skip_option(fh, new_mode, identifier)

    elif identifier in ["skip_range", "skip_nested_range"]:
        value = __parse_range_skipper_option(fh, identifier, new_mode)

    elif identifier == "indentation":
        value = counter.parse_indentation(fh)
        value.set_containing_mode_name(new_mode.name)
        blackboard.required_support_indentation_count_set()

    elif identifier == "counter":
        value = counter.parse_line_column_counter(fh)

    elif identifier in ("entry", "exit", "restrict"):
        value = read_option_value(fh, ListF=True)  # A 'list' of strings
    else:
        value = read_option_value(fh)  # A single string

    # Finally, set the option
    new_mode.option_db.enter(identifier, value, source_reference,
                             new_mode.name)
    return True
Example #9
def __validate_definition(TypeCodeFragment, NameStr, 
                          AlreadyMentionedList, StandardMembersF):
    FileName = TypeCodeFragment.filename
    LineN    = TypeCodeFragment.line_n
    if StandardMembersF:
        verify_word_in_list(NameStr, TokenType_StandardMemberList, 
                            "Member name '%s' not allowed in token_type section 'standard'." % NameStr, 
                            FileName, LineN)

        # Standard Members are all numeric types
        TypeStr = TypeCodeFragment.get_pure_code()
        if    TypeStr.find("string") != -1 \
           or TypeStr.find("vector") != -1 \
           or TypeStr.find("map")    != -1:
            error_msg("Numeric type required.\n" + \
                      "Example: <token_id: uint16_t>, Found: '%s'\n" % TypeStr, FileName, LineN)
    else:
        if NameStr in TokenType_StandardMemberList:
            error_msg("Member '%s' only allowed in 'standard' section." % NameStr,
                      FileName, LineN)

    for candidate in AlreadyMentionedList:
        if candidate[0] != NameStr: continue 
        error_msg("Token type member name '%s' defined twice." % NameStr,
                  FileName, LineN, DontExitF=True)
        error_msg("Previously defined here.",
                  candidate[1].filename, candidate[1].line_n)
Example #10
def __parse_event(new_mode, fh, word):
    pos = fh.tell()

    # Allow '<<EOF>>' and '<<FAIL>>' out of respect for classical tools like 'lex'
    if word == "<<EOF>>": word = "on_end_of_stream"
    elif word == "<<FAIL>>": word = "on_failure"
    elif word in blackboard.all_section_title_list:
        error_msg("Pattern '%s' is a quex section title. Has the closing '}' of mode %s \n" % (word, new_mode.name) \
                  + "been forgotten? Else use quotes, i.e. \"%s\"." % word, fh)
    elif len(word) < 3 or word[:3] != "on_":
        return False

    comment = "Unknown event handler '%s'. \n" % word + \
              "Note, that any pattern starting with 'on_' is considered an event handler.\n" + \
              "use double quotes to bracket patterns that start with 'on_'."

    __general_validate(fh, new_mode, word, pos)
    verify_word_in_list(word, event_handler_db.keys(), comment, fh)
    __validate_required_token_policy_queue(word, fh, pos)

    continue_f = True
    if word == "on_end_of_stream":
        # When a termination token is sent, no other token shall follow.
        # => Enforce return from the analyzer! Do not allow CONTINUE!
        continue_f = False

    new_mode.events[word] = code_fragment.parse(fh,
                                                "%s::%s event handler" %
                                                (new_mode.name, word),
                                                ContinueF=continue_f)

    return True
Example #11
def __validate_definition(TypeCodeFragment, NameStr, AlreadyMentionedList,
                          StandardMembersF):
    FileName = TypeCodeFragment.filename
    LineN = TypeCodeFragment.line_n
    if StandardMembersF:
        verify_word_in_list(
            NameStr, TokenType_StandardMemberList,
            "Member name '%s' not allowed in token_type section 'standard'." %
            NameStr, FileName, LineN)

        # Standard Members are all numeric types
        TypeStr = TypeCodeFragment.get_pure_code()
        if    TypeStr.find("string") != -1 \
           or TypeStr.find("vector") != -1 \
           or TypeStr.find("map")    != -1:
            error_msg("Numeric type required.\n" + \
                      "Example: <token_id: uint16_t>, Found: '%s'\n" % TypeStr, FileName, LineN)
    else:
        if NameStr in TokenType_StandardMemberList:
            error_msg(
                "Member '%s' only allowed in 'standard' section." % NameStr,
                FileName, LineN)

    for candidate in AlreadyMentionedList:
        if candidate[0] != NameStr: continue
        error_msg("Token type member name '%s' defined twice." % NameStr,
                  FileName,
                  LineN,
                  DontExitF=True)
        error_msg("Previously defined here.", candidate[1].filename,
                  candidate[1].line_n)
Example #12
def snap_replacement(stream, PatternDict, StateMachineF=True):
    """Snaps a predefined pattern from the input string and returns the resulting
       state machine.
    """
    skip_whitespace(stream)
    pattern_name = read_identifier(stream)
    if pattern_name == "":
        raise RegularExpressionException(
            "Pattern replacement expression misses identifier after '{'.")
    skip_whitespace(stream)

    if not check(stream, "}"):
        raise RegularExpressionException("Pattern replacement expression misses closing '}' after '%s'." \
                                         % pattern_name)

    verify_word_in_list(
        pattern_name, PatternDict.keys(),
        "Specifier '%s' not found in any preceeding 'define { ... }' section."
        % pattern_name, stream)

    reference = PatternDict[pattern_name]
    assert reference.__class__.__name__ == "PatternShorthand"

    # The replacement may be a state machine or a number set
    if StateMachineF:
        # Get a cloned version of state machine
        state_machine = reference.get_state_machine()
        assert isinstance(state_machine, StateMachine)

        # It is essential that state machines defined as patterns do not
        # have origins. Otherwise, the optimization of patterns that
        # contain pattern replacements might get confused and can
        # not find all optimizations.
        assert state_machine.has_origins() == False

        # A state machine, that contains pre- or post- conditions cannot be part
        # of a replacement. The addition of new post-contexts would mess up the pattern.
        ## if state_machine.has_pre_or_post_context():
        ##    error_msg("Pre- or post-conditioned pattern was used in replacement.\n" + \
        ##              "Quex's regular expression grammar does not allow this.", stream)

        return state_machine

    else:
        # Get a cloned version of character set
        character_set = reference.get_character_set()
        if character_set is None:
            error_msg(
                "Replacement in character set expression must be a character set.\n"
                "Specifier '%s' relates to a pattern state machine." %
                pattern_name, stream)

        if character_set.is_empty():
            error_msg(
                "Referenced character set '%s' is empty.\nAborted." %
                pattern_name, stream)

        return character_set
Example #13
    def __determine_base_mode_sequence(self, ModeDescr, InheritancePath,
                                       base_mode_sequence):
        """Determine the sequence of base modes. The type of sequencing determines
           also the pattern precedence. The 'deep first' scheme is chosen here. For
           example a mode hierarchy of

                                       A
                                     /   \ 
                                    B     C
                                   / \   / \
                                  D  E  F   G

           results in a sequence: (A, B, D, E, C, F, G).reverse()

           => That is the mode itself is base_mode_sequence[-1]

           => Patterns and event handlers of 'E' have precedence over
              'C' because they are the children of a preceding base mode.

           This function detects circular inheritance.

        __dive -- this keyword was inserted for the sole purpose of signaling
                  that this is a case of recursion, which may later be solved
                  by a TreeWalker.
        """
        if ModeDescr.name in InheritancePath:
            msg = "mode '%s'\n" % InheritancePath[0]
            for mode_name in InheritancePath[InheritancePath.index(ModeDescr.name) + 1:]:
                msg += "   inherits mode '%s'\n" % mode_name
            msg += "   inherits mode '%s'" % ModeDescr.name

            error_msg("circular inheritance detected:\n" + msg,
                      ModeDescr.sr.file_name, ModeDescr.sr.line_n)

        base_mode_name_list_reversed = deepcopy(ModeDescr.derived_from_list)
        #base_mode_name_list_reversed.reverse()
        for name in base_mode_name_list_reversed:
            # -- does mode exist?
            verify_word_in_list(
                name, blackboard.mode_description_db.keys(),
                "Mode '%s' inherits mode '%s' which does not exist." %
                (ModeDescr.name, name), ModeDescr.sr.file_name,
                ModeDescr.sr.line_n)

            if name in map(lambda m: m.name, base_mode_sequence): continue

            # -- grab the mode description
            mode_descr = blackboard.mode_description_db[name]
            self.__determine_base_mode_sequence(
                mode_descr, InheritancePath + [ModeDescr.name],
                base_mode_sequence)

        base_mode_sequence.append(ModeDescr)

        return base_mode_sequence
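
The 'deep first' ordering that the docstring describes can be illustrated without any quex machinery. The toy below (plain Python with hypothetical data, not quex code) visits base modes before the mode that derives from them, skips duplicates, and leaves the mode itself at result[-1], matching the base_mode_sequence[-1] invariant stated above.

def deep_first_sequence(name, derived_from_db, result=None):
    # Toy stand-in for __determine_base_mode_sequence: recurse into the
    # base modes first, then append the mode itself.
    if result is None:
        result = []
    for base in derived_from_db.get(name, []):
        if base not in result:
            deep_first_sequence(base, derived_from_db, result)
    result.append(name)
    return result

db = {"A": ["B", "C"], "B": ["D", "E"], "C": ["F", "G"]}
print(deep_first_sequence("A", db))  # ['D', 'E', 'B', 'F', 'G', 'C', 'A'] -- 'A' comes last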
Example #14
def __get_distinct_codec_name_for_alias(CodecAlias, FH=-1, LineN=None):
    """Arguments FH and LineN correspond to the arguments of error_msg."""
    assert len(CodecAlias) != 0

    for record in get_codec_list_db():
        if CodecAlias in record[1] or CodecAlias == record[0]: 
            return record[0]

    verify_word_in_list(CodecAlias, get_supported_codec_list(), 
                        "Character encoding '%s' unknown to current version of quex." % CodecAlias,
                        FH, LineN)
Example #15
def _get_distinct_codec_name_for_alias(CodecAlias, FH=-1, LineN=None):
    """Arguments FH and LineN correspond to the arguments of error_msg."""
    assert len(CodecAlias) != 0

    for record in parser.get_codec_list_db():
        if CodecAlias in record[1] or CodecAlias == record[0]: 
            return record[0]

    verify_word_in_list(CodecAlias, get_supported_codec_list(), 
                        "Character encoding '%s' unknown to current version of quex." % CodecAlias,
                        FH, LineN)
Example #16
def get_codecs_for_language(Language):
    result = []
    for record in parser.get_codec_list_db():
        codec = record[0]
        if codec not in get_supported_codec_list(): continue
        if Language in record[2]: 
            result.append(record[0])
    if len(result) == 0:
        verify_word_in_list(Language, get_supported_language_list(),
                            "No codec found for language '%s'." % Language)
    return result
Example #17
def snap_set_term(stream, PatternDict):
    global special_character_set_db

    __debug_entry("set_term", stream)    

    operation_list     = [ "union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db.keys()

    skip_whitespace(stream)
    position = stream.tell()

    # if no identifier follows, control falls through to the 'snap_set_expression' branch below
    word = read_identifier(stream)

    if word in operation_list: 
        set_list = snap_set_list(stream, word, PatternDict)
        # if an error occurs during set_list parsing, a syntax error exception is thrown

        L      = len(set_list)
        result = set_list[0]

        if word == "inverse":
            # The inverse of multiple sets is the inverse of the union of these sets.
            if L > 1:
                for character_set in set_list[1:]:
                    result.unite_with(character_set)
            return __debug_exit(result.get_complement(Setup.buffer_codec.source_set), stream)

        if L < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")
            
        if   word == "union":
            for set in set_list[1:]:
                result.unite_with(set)
        elif word == "intersection":
            for set in set_list[1:]:
                result.intersect_with(set)
        elif word == "difference":
            for set in set_list[1:]:
                result.subtract(set)

    elif word in character_set_list:
        reg_expr = special_character_set_db[word]
        result   = traditional_character_set.do_string(reg_expr)

    elif word != "":
        verify_word_in_list(word, character_set_list + operation_list, 
                            "Unknown keyword '%s'." % word, stream)
    else:
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    return __debug_exit(result, stream)
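
The union/intersection/difference loops above operate on quex NumberSet objects, but the algebra itself is ordinary set arithmetic. A toy rendering of the 'difference' branch with plain Python sets (illustration only, not quex code):

# Mirror of the result.subtract(set) loop in snap_set_term: subtract
# every later set from the first one.
set_list = [set("abcf"), set("bcd"), set("c")]
result = set_list[0]
for s in set_list[1:]:
    result = result - s
print(sorted(result))  # ['a', 'f']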
Example #18
def get_codecs_for_language(Language):
    result = []
    for record in parser.get_codec_list_db():
        codec = record[0]
        if codec not in get_supported_codec_list(): continue
        if Language in record[2]:
            result.append(record[0])
    if len(result) == 0:
        verify_word_in_list(Language, get_supported_language_list(),
                            "No codec found for language '%s'." % Language)
    return result
Example #19
def __entry_exit_transitions(mode, mode_name_list):
    FileName = mode.filename
    LineN = mode.line_n
    for mode_name in mode.options["exit"]:

        verify_word_in_list(mode_name, mode_name_list,
                            "Mode '%s' allows entry from\nmode '%s' but no such mode exists." % \
                            (mode.name, mode_name), FileName, LineN)

        that_mode = blackboard.mode_db[mode_name]

        # Other mode allows all entries => don't worry.
        if len(that_mode.options["entry"]) == 0: continue

        # Other mode restricts the entries from other modes
        # => check if this mode or one of the base modes can enter
        for base_mode in mode.get_base_mode_sequence():
            if base_mode.name in that_mode.options["entry"]: break
        else:
            error_msg("Mode '%s' has an exit to mode '%s' but" %
                      (mode.name, mode_name),
                      FileName,
                      LineN,
                      DontExitF=True,
                      WarningF=False)
            error_msg("mode '%s' has no entry for mode '%s'\n" % (mode_name, mode.name) + \
                      "or any of its base modes.",
                      that_mode.filename, that_mode.line_n)

    for mode_name in mode.options["entry"]:
        # Does that mode exist?
        verify_word_in_list(mode_name, mode_name_list,
                            "Mode '%s' allows entry from\nmode '%s' but no such mode exists." % \
                            (mode.name, mode_name), FileName, LineN)

        that_mode = blackboard.mode_db[mode_name]
        # Other mode allows all exits => don't worry.
        if len(that_mode.options["exit"]) == 0: continue

        # Other mode restricts the exits to other modes
        # => check if this mode or one of the base modes can be reached
        for base_mode in mode.get_base_mode_sequence():
            if base_mode.name in that_mode.options["exit"]: break
        else:
            error_msg("Mode '%s' has an entry for mode '%s' but" %
                      (mode.name, mode_name),
                      FileName,
                      LineN,
                      DontExitF=True,
                      WarningF=False)
            error_msg("mode '%s' has no exit to mode '%s'\n" % (mode_name, mode.name) + \
                      "or any of its base modes.",
                      that_mode.filename, that_mode.line_n)
Example #20
def snap_replacement(stream, PatternDict, StateMachineF=True):
    """Snaps a predefined pattern from the input string and returns the resulting
       state machine.
    """ 
    skip_whitespace(stream)
    pattern_name = read_identifier(stream)  
    if pattern_name == "":
        raise RegularExpressionException("Pattern replacement expression misses identifier after '{'.")
    skip_whitespace(stream)

    if not check(stream, "}"):
        raise RegularExpressionException("Pattern replacement expression misses closing '}' after '%s'." \
                                         % pattern_name)

    verify_word_in_list(pattern_name, PatternDict.keys(),
                        "Specifier '%s' not found in any preceeding 'define { ... }' section." % pattern_name, 
                        stream)

    reference = PatternDict[pattern_name]
    assert reference.__class__.__name__ == "PatternShorthand" 

    # The replacement may be a state machine or a number set
    if StateMachineF:
        # Get a cloned version of state machine
        state_machine = reference.get_state_machine()
        assert isinstance(state_machine, StateMachine)

        # It is essential that state machines defined as patterns do not 
        # have origins. Otherwise, the optimization of patterns that
        # contain pattern replacements might get confused and can
        # not find all optimizations.
        assert state_machine.has_origins() == False
            
        # A state machine, that contains pre- or post- conditions cannot be part
        # of a replacement. The addition of new post-contexts would mess up the pattern.
        ## if state_machine.has_pre_or_post_context():
        ##    error_msg("Pre- or post-conditioned pattern was used in replacement.\n" + \
        ##              "Quex's regular expression grammar does not allow this.", stream)
            
        return state_machine

    else:
        # Get a cloned version of character set
        character_set = reference.get_character_set()
        if character_set is None:
            error_msg("Replacement in character set expression must be a character set.\n"
                      "Specifier '%s' relates to a pattern state machine." % pattern_name, stream)

        if character_set.is_empty():
            error_msg("Referenced character set '%s' is empty.\nAborted." % pattern_name, stream)

        return character_set
Example #21
def read_character_code(fh):
    # NOTE: This function is tested with the regression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()
    
    start = fh.read(1)
    if start == "":  
        fh.seek(pos); return -1

    elif start == "'": 
        # read a utf-8 char and get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code is None:
            error_msg("Missing utf8-character for definition of character code by character.", fh)

        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C": fh.seek(pos); return -1
        # read Unicode Name 
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "": fh.seek(pos); return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            verify_word_in_list(ucs_name, ucs_property_db["Name"].code_point_db,
                                "The string %s\ndoes not identify a known unicode character." % ucs_name, 
                                fh)
        elif type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh) 
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None: return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1               
Example #22
    def __determine_base_mode_sequence(self, ModeDescr, InheritancePath, base_mode_sequence):
        """Determine the sequence of base modes. The type of sequencing determines
           also the pattern precedence. The 'deep first' scheme is chosen here. For
           example a mode hierarchy of

                                       A
                                     /   \ 
                                    B     C
                                   / \   / \
                                  D  E  F   G

           results in a sequence: (A, B, D, E, C, F, G).reverse()

           => That is the mode itself is base_mode_sequence[-1]

           => Patterns and event handlers of 'E' have precedence over
              'C' because they are the children of a preceding base mode.

           This function detects circular inheritance.

        __dive -- this keyword was inserted for the sole purpose of signaling
                  that this is a case of recursion, which may later be solved
                  by a TreeWalker.
        """
        if ModeDescr.name in InheritancePath:
            msg = "mode '%s'\n" % InheritancePath[0]
            for mode_name in InheritancePath[InheritancePath.index(ModeDescr.name) + 1:]:
                msg += "   inherits mode '%s'\n" % mode_name
            msg += "   inherits mode '%s'" % ModeDescr.name

            error_msg("circular inheritance detected:\n" + msg, ModeDescr.sr.file_name, ModeDescr.sr.line_n)

        base_mode_name_list_reversed = deepcopy(ModeDescr.derived_from_list)
        #base_mode_name_list_reversed.reverse()
        for name in base_mode_name_list_reversed:
            # -- does mode exist?
            verify_word_in_list(name, blackboard.mode_description_db.keys(),
                                "Mode '%s' inherits mode '%s' which does not exist." % (ModeDescr.name, name),
                                ModeDescr.sr.file_name, ModeDescr.sr.line_n)

            if name in map(lambda m: m.name, base_mode_sequence): continue

            # -- grab the mode description
            mode_descr = blackboard.mode_description_db[name]
            self.__determine_base_mode_sequence(mode_descr, InheritancePath + [ModeDescr.name], base_mode_sequence)

        base_mode_sequence.append(ModeDescr)

        return base_mode_sequence
Example #23
    def __determine_base_mode_sequence(self, ModeDescr, InheritancePath):
        """Determine the sequence of base modes. The type of sequencing determines
           also the pattern precedence. The 'deep first' scheme is chosen here. For
           example a mode hierarchy of

                                       A
                                     /   \ 
                                    B     C
                                   / \   / \
                                  D  E  F   G

           results in a sequence: (A, B, D, E, C, F, G).reverse()

           This means that patterns and event handlers of 'E' have precedence over
           'C' because they are the children of a preceding base mode.

           This function detects circular inheritance.
        """
        if ModeDescr.name in InheritancePath:
            msg = "mode '%s'\n" % InheritancePath[0]
            for mode_name in InheritancePath[InheritancePath.index(ModeDescr.name) + 1:]:
                msg += "   inherits mode '%s'\n" % mode_name
            msg += "   inherits mode '%s'" % ModeDescr.name

            error_msg("circular inheritance detected:\n" + msg,
                      ModeDescr.filename, ModeDescr.line_n)

        base_mode_name_list_reversed = deepcopy(ModeDescr.base_modes)
        #base_mode_name_list_reversed.reverse()
        for name in base_mode_name_list_reversed:
            # -- does mode exist?
            verify_word_in_list(
                name, mode_description_db.keys(),
                "Mode '%s' inherits mode '%s' which does not exist." %
                (ModeDescr.name, name), ModeDescr.filename, ModeDescr.line_n)

            if name in map(lambda m: m.name, self.__base_mode_sequence):
                continue

            # -- grab the mode description
            mode_descr = mode_description_db[name]
            self.__determine_base_mode_sequence(
                mode_descr, InheritancePath + [ModeDescr.name])

        self.__base_mode_sequence.append(ModeDescr)

        return self.__base_mode_sequence
Example #24
def __entry_exit_transitions(mode, mode_name_list):
    FileName = mode.filename
    LineN    = mode.line_n
    for mode_name in mode.options["exit"]:

        verify_word_in_list(mode_name, mode_name_list,
                            "Mode '%s' allows entry from\nmode '%s' but no such mode exists." % \
                            (mode.name, mode_name), FileName, LineN)

        that_mode = blackboard.mode_db[mode_name]

        # Other mode allows all entries => don't worry.
        if len(that_mode.options["entry"]) == 0: continue

        # Other mode restricts the entries from other modes
        # => check if this mode or one of the base modes can enter
        for base_mode in mode.get_base_mode_sequence():
            if base_mode.name in that_mode.options["entry"]: break
        else:
            error_msg("Mode '%s' has an exit to mode '%s' but" % (mode.name, mode_name),
                      FileName, LineN, DontExitF=True, WarningF=False)
            error_msg("mode '%s' has no entry for mode '%s'\n" % (mode_name, mode.name) + \
                      "or any of its base modes.",
                      that_mode.filename, that_mode.line_n)

    for mode_name in mode.options["entry"]:
        # Does that mode exist?
        verify_word_in_list(mode_name, mode_name_list,
                            "Mode '%s' allows entry from\nmode '%s' but no such mode exists." % \
                            (mode.name, mode_name), FileName, LineN)

        that_mode = blackboard.mode_db[mode_name]
        # Other mode allows all exits => don't worry.
        if len(that_mode.options["exit"]) == 0: continue

        # Other mode restricts the exits to other modes
        # => check if this mode or one of the base modes can be reached
        for base_mode in mode.get_base_mode_sequence():
            if base_mode.name in that_mode.options["exit"]: break
        else:
            error_msg("Mode '%s' has an entry for mode '%s' but" % (mode.name, mode_name),
                      FileName, LineN, DontExitF=True, WarningF=False)
            error_msg("mode '%s' has no exit to mode '%s'\n" % (mode_name, mode.name) + \
                      "or any of its base modes.",
                      that_mode.filename, that_mode.line_n)
Example #25
def __parse_definition_head(fh, result):

    if check(fh, "\\default"): 
        error_msg("'\\default' has been replaced by keyword '\\else' since quex 0.64.9!", fh)
    elif check(fh, "\\else"): 
        pattern = None
    else:                      
        pattern = regular_expression.parse(fh)

    skip_whitespace(fh)
    check_or_die(fh, "=>", " after character set definition.")

    skip_whitespace(fh)
    identifier = read_identifier(fh, OnMissingStr="Missing identifier for indentation element definition.")
    verify_word_in_list(identifier, result.identifier_list, 
                        "Unrecognized specifier '%s'." % identifier, fh)
    skip_whitespace(fh)

    return pattern, identifier, SourceRef.from_FileHandle(fh)
Example #26
    def __determine_base_mode_sequence(self, ModeDescr, InheritancePath):
        """Determine the sequence of base modes. The type of sequencing determines
           also the pattern precedence. The 'deep first' scheme is chosen here. For
           example a mode hierarchy of

                                       A
                                     /   \ 
                                    B     C
                                   / \   / \
                                  D  E  F   G

           results in a sequence: (A, B, D, E, C, F, G).reverse()

           This means that patterns and event handlers of 'E' have precedence over
           'C' because they are the children of a preceding base mode.

           This function detects circular inheritance.
        """
        if ModeDescr.name in InheritancePath:
            msg = "mode '%s'\n" % InheritancePath[0]
            for mode_name in InheritancePath[InheritancePath.index(ModeDescr.name) + 1:]:
                msg += "   inherits mode '%s'\n" % mode_name
            msg += "   inherits mode '%s'" % ModeDescr.name

            error_msg("circular inheritance detected:\n" + msg, ModeDescr.filename, ModeDescr.line_n)

        base_mode_name_list_reversed = deepcopy(ModeDescr.base_modes)
        #base_mode_name_list_reversed.reverse()
        for name in base_mode_name_list_reversed:
            # -- does mode exist?
            verify_word_in_list(name, mode_description_db.keys(),
                                "Mode '%s' inherits mode '%s' which does not exist." % (ModeDescr.name, name),
                                ModeDescr.filename, ModeDescr.line_n)

            if name in map(lambda m: m.name, self.__base_mode_sequence): continue

            # -- grab the mode description
            mode_descr = mode_description_db[name]
            self.__determine_base_mode_sequence(mode_descr, InheritancePath + [ModeDescr.name])

        self.__base_mode_sequence.append(ModeDescr)

        return self.__base_mode_sequence
Example #27
def __start_mode(implemented_mode_name_list, mode_name_list):
    """If more then one mode is defined, then that requires an explicit 
       definition 'start = mode'.
    """
    assert len(implemented_mode_name_list) != 0

    assert blackboard.initial_mode is not None

    start_mode = blackboard.initial_mode.get_pure_text()
    FileName   = blackboard.initial_mode.sr.file_name
    LineN      = blackboard.initial_mode.sr.line_n

    # Start mode present and applicable?
    verify_word_in_list(start_mode, mode_name_list,
                        "Start mode '%s' is not defined." % start_mode,
                        FileName, LineN)
    verify_word_in_list(start_mode, implemented_mode_name_list,
                        "Start mode '%s' is inheritable only and cannot be instantiated." % start_mode,
                        FileName, LineN)
Example #28
def __parse_definition_head(fh, result):

    if check(fh, "\\default"):
        error_msg(
            "'\\default' has been replaced by keyword '\\else' since quex 0.64.9!",
            fh)
    elif check(fh, "\\else"):
        pattern = None
    else:
        pattern = regular_expression.parse(fh)

    skip_whitespace(fh)
    check_or_die(fh, "=>", " after character set definition.")

    skip_whitespace(fh)
    identifier = read_identifier(
        fh,
        OnMissingStr="Missing identifier for indentation element definition.")
    verify_word_in_list(identifier, result.identifier_list,
                        "Unrecognized specifier '%s'." % identifier, fh)
    skip_whitespace(fh)

    return pattern, identifier, SourceRef.from_FileHandle(fh)
Example #29
def __perform_setup(command_line, argv):
    """RETURN:  True, if process needs to be started.
                False, if job is done.
    """
    global setup

    # (*) Classes and their namespace
    __setup_analyzer_class(setup)
    __setup_token_class(setup)
    __setup_token_id_prefix(setup)
    __setup_lexeme_null(setup)  # Requires 'token_class_name_space'

    # (*) Output programming language
    setup.language = setup.language.upper()
    verify_word_in_list(
        setup.language, quex_core_engine_generator_languages_db.keys(),
        "Programming language '%s' is not supported." % setup.language)
    setup.language_db = quex_core_engine_generator_languages_db[setup.language]
    setup.extension_db = global_extension_db[setup.language]

    # Is the output file naming scheme provided by the extension database
    # (Validation must happen immediately)
    if setup.extension_db.has_key(setup.output_file_naming_scheme) == False:
        error_msg("File extension scheme '%s' is not provided for language '%s'.\n" \
                  % (setup.output_file_naming_scheme, setup.language) + \
                  "Available schemes are: %s." % repr(setup.extension_db.keys())[1:-1])

    # Before file names can be prepared, determine the output directory
    # If 'source packaging' is enabled and no output directory is specified
    # then take the directory of the source packaging.
    if setup.source_package_directory != "" and setup.output_directory == "":
        setup.output_directory = setup.source_package_directory

    if setup.buffer_codec in ["utf8", "utf16"]:
        setup.buffer_codec_transformation_info = setup.buffer_codec + "-state-split"

    elif setup.buffer_codec_file != "":
        try:
            setup.buffer_codec = os.path.splitext(
                os.path.basename(setup.buffer_codec_file))[0]
        except:
            error_msg("cannot interpret string following '--codec-file'")

        setup.buffer_codec_transformation_info = codec_db.get_codec_transformation_info(
            FileName=setup.buffer_codec_file)

    elif setup.buffer_codec != "unicode":
        setup.buffer_codec_transformation_info = codec_db.get_codec_transformation_info(
            setup.buffer_codec)

    if setup.buffer_codec != "unicode":
        setup.buffer_element_size_irrelevant = True

    # (*) Output files
    if setup.language not in ["DOT"]:
        prepare_file_names(setup)

    if setup.buffer_byte_order == "<system>":
        setup.buffer_byte_order = sys.byteorder
        setup.byte_order_is_that_of_current_system_f = True
    else:
        setup.byte_order_is_that_of_current_system_f = False

    if setup.buffer_element_size == "wchar_t":
        error_msg(
            "Since Quex version 0.53.5, 'wchar_t' can no longer be specified\n"
            "with option '--buffer-element-size' or '-bes'. Please, specify\n"
            "'--buffer-element-type wchar_t' or '--bet'.")

    if setup.buffer_element_type == "wchar_t":
        setup.converter_ucs_coding_name = "WCHAR_T"

    make_numbers(setup)

    # (*) Determine buffer element type and size (in bytes)
    if setup.buffer_element_size == -1:
        if global_character_type_db.has_key(setup.buffer_element_type):
            setup.buffer_element_size = global_character_type_db[
                setup.buffer_element_type][3]
        elif setup.buffer_element_type == "":
            setup.buffer_element_size = 1
        else:
            # If the buffer element type is defined, then here we know that it is 'unknown'
            # and Quex cannot know its size on its own.
            setup.buffer_element_size = -1

    if setup.buffer_element_type == "":
        if setup.buffer_element_size in [1, 2, 4]:
            setup.buffer_element_type = {
                1: "uint8_t",
                2: "uint16_t",
                4: "uint32_t",
            }[setup.buffer_element_size]
        elif setup.buffer_element_size == -1:
            pass
        else:
            error_msg("Buffer element type cannot be determined for size '%i' which\n" \
                      % setup.buffer_element_size +
                      "has been specified by '-b' or '--buffer-element-size'.")

    setup.converter_f = False
    if setup.converter_iconv_f or setup.converter_icu_f:
        setup.converter_f = True

    # The only case where no converter helper is required is where ASCII
    # (Unicode restricted to [0, FF]) is used.
    setup.converter_helper_required_f = True
    if setup.converter_f == False and setup.buffer_element_size == 1 and setup.buffer_codec == "unicode":
        setup.converter_helper_required_f = False

    validation.do(setup, command_line, argv)

    if setup.converter_ucs_coding_name == "":
        if global_character_type_db.has_key(setup.buffer_element_type):
            if setup.buffer_byte_order == "little": index = 1
            else: index = 2
            setup.converter_ucs_coding_name = global_character_type_db[
                setup.buffer_element_type][index]

    if setup.token_id_foreign_definition_file != "":
        CommentDelimiterList = [["//", "\n"], ["/*", "*/"]]
        # Regular expression to find '#include <something>' and extract the 'something'
        # in a 'group'. Note that '(' ')' cause the storage of parts of the match.
        IncludeRE = "#[ \t]*include[ \t]*[\"<]([^\">]+)[\">]"
        #
        parse_token_id_file(setup.token_id_foreign_definition_file,
                            setup.token_id_prefix, CommentDelimiterList,
                            IncludeRE)
        if setup.token_id_prefix_plain != setup.token_id_prefix:
            # The 'plain' name space less token indices are also supported
            parse_token_id_file(setup.token_id_foreign_definition_file,
                                setup.token_id_prefix_plain,
                                CommentDelimiterList, IncludeRE)

    # (*) Compression Types
    compression_type_list = []
    for name, ctype in [
        ("compression_template_f", E_Compression.TEMPLATE),
        ("compression_template_uniform_f", E_Compression.TEMPLATE_UNIFORM),
        ("compression_path_f", E_Compression.PATH),
        ("compression_path_uniform_f", E_Compression.PATH_UNIFORM)
    ]:
        if command_line_args_defined(command_line, name):
            compression_type_list.append(
                (command_line_arg_position(name), ctype))
    compression_type_list.sort(key=itemgetter(0))
    setup.compression_type_list = map(lambda x: x[1], compression_type_list)

    # (*) return setup ___________________________________________________________________
    return True
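
The compression-type bookkeeping at the end preserves the order in which the flags appeared on the command line by sorting on the recorded argument position. In isolation (toy data, not quex code):

from operator import itemgetter

# (argument position, compression type) pairs, as collected above.
pairs = [(7, "PATH"), (2, "TEMPLATE")]
pairs.sort(key=itemgetter(0))
print([ctype for _, ctype in pairs])  # ['TEMPLATE', 'PATH'] -- command-line order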
Example #30
def __create_token_sender_by_token_name(fh, TokenName):
    assert type(TokenName) in [str, unicode]

    # Enter token_id into database, if it is not yet defined.
    token_id_db_verify_or_enter_token_id(fh, TokenName)

    # Parse the token argument list
    argument_list = __parse_function_argument_list(fh, TokenName)

    # Create the token sender
    explicit_member_names_f = False
    for arg in argument_list:
        if arg.find("=") != -1: explicit_member_names_f = True

    assert blackboard.token_type_definition is not None, \
           "A valid token_type_definition must have been parsed at this point."

    if not explicit_member_names_f:
        # There are only two allowed cases for implicit token member names:
        #  QUEX_TKN_XYZ(Lexeme)     --> call take_text(Lexeme, LexemeEnd)
        #  QUEX_TKN_XYZ(Begin, End) --> call to take_text(Begin, End)
        if len(argument_list) == 2:
            return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, (%s), (%s));\n" % \
                   (argument_list[0], argument_list[1]) + \
                   "self_send(%s);\n" % (TokenName)

        elif len(argument_list) == 1:
            if argument_list[0] == "Lexeme":
                return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, self.buffer._lexeme_start_p, self.buffer._input_p);\n" \
                       "self_send(%s);\n" % (TokenName)
            elif argument_list[0] == "LexemeNull":
                return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, LexemeNull, LexemeNull);\n" \
                       "self_send(%s);\n" % (TokenName)
            else:
                error_msg("If one unnamed argument is specified it must be 'Lexeme'\n"          + \
                          "or 'LexemeNull'. Found '%s'.\n" % argument_list[0]                     + \
                          "To cut parts of the lexeme, please, use the 2 argument sender, e.g.\n" + \
                          "QUEX_TKN_MY_ID(Lexeme + 1, LexemeEnd - 2);\n"                             + \
                          "Alternatively, use named parameters such as 'number=...'.", fh)

        elif len(argument_list) == 0:
            return "self_send(%s);\n" % TokenName

        else:
            error_msg(
                "Since 0.49.1, there are only the following brief token senders that can take\n"
                "unnamed token arguments:\n"
                "     one argument:   'Lexeme'   =>  token.take_text(..., LexemeBegin, LexemeEnd);\n"
                "     two arguments:  Begin, End =>  token.take_text(..., Begin, End);\n"
                + "Found: " + repr(argument_list)[1:-1] + ".", fh)

        # Returned from Function if implicit member names

    member_value_pairs = map(lambda x: x.split("="), argument_list)
    txt = ""
    for member, value in member_value_pairs:
        if value == "":
            error_msg("One explicit argument name mentioned requires all arguments to\n"  + \
                      "be mentioned explicitly. Value '%s' mentioned without argument.\n"   \
                      % member, fh)

        if Setup.token_class_file != "":
            error_msg("Member assignments in brief token senders are inadmissible\n" + \
                      "with manually written token classes. User provided file '%s'.\n" % Setup.token_class_file + \
                      "Found member assignment: '%s' = '%s'." % (member, value), fh)
        else:
            member_name = member.strip()
            verify_word_in_list(
                member_name, blackboard.token_type_definition.get_member_db(),
                "No member:   '%s' in token type description." % member_name,
                fh)
            idx = value.find("Lexeme")
            if idx != -1:
                if idx != 0 and value[idx - 1] == "(":
                    pass
                else:
                    error_msg(
                        "Assignment of token member '%s' with 'Lexeme' directly being involved. The\n"
                        % member_name +
                        "'Lexeme' points into the text buffer and it is not owned by the token object.\n"
                        "\n"
                        "Proposals:\n\n"
                        "   (1) Use '(Lexeme)', i.e. surround 'Lexeme' by brackets to indicate\n"
                        "       that you are aware of the danger. Do this, if at the end of the\n"
                        "       process, the member can be assumed to relate to an object that\n"
                        "       is not directly dependent anymore on 'Lexeme'. This is particularly\n"
                        "       true if the member is of type 'std::string'. Its constructor\n"
                        "       creates a copy of the zero terminated string.\n\n"
                        "   (2) Use token senders without named arguments, for example\n"
                        "          \"%s(Lexeme+1, LexemeEnd-2)\"\n" % TokenName
                        + "          \"%s(Lexeme)\"\n" % TokenName +
                        "       These token senders create a copy of the lexeme and let the token\n"
                        "       own it.", fh)

            access = blackboard.token_type_definition.get_member_access(
                member_name)
            txt += "self_write_token_p()->%s = %s;\n" % (access, value.strip())

    # Box the token, stamp it with an id and 'send' it
    txt += "self_send(%s);\n" % TokenName
    return txt
Example #31
def read_character_code(fh):
    # NOTE: This function is tested with the regression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        fh.seek(pos)
        return -1

    elif start == "'":
        # read a utf-8 char and get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(
                fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code is None:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)

        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            verify_word_in_list(
                ucs_name, ucs_property_db["Name"].code_point_db,
                "The string %s\ndoes not identify a known unicode character." %
                ucs_name, fh)
        elif type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database." %
                ucs_name, fh)
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None: return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1
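
# A usage sketch, not part of the quex sources: the notations accepted by
# read_character_code() above, fed through file-like objects. It assumes the
# surrounding module context and that read_integer() accepts decimal literals;
# the 'UC <NAME>' form resolves through the unicode property database.
from StringIO import StringIO   # Python 2, matching the codebase

for text in ["'+'", "UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE", "43"]:
    print text, "->", read_character_code(StringIO(text))
    # "'+'" and "43" both yield 43; anything unrecognized yields -1.
Exemple #32
0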
def do(fh):
    """Parses pattern definitions of the form:
   
          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

       In other words the right hand side *must* be a character set.
          
    """
    indentation_setup = IndentationSetup(fh)

    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    skip_whitespace(fh)

    while True:
        skip_whitespace(fh)

        if check(fh, ">"): 
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup
        
        # A regular expression state machine
        pattern_str, pattern = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.", fh)

        verify_word_in_list(identifier, 
                            ["space", "grid", "bad", "newline", "suppressor"],
                            "Unrecognized indentation specifier '%s'." % identifier, fh)

        trigger_set = None
        if identifier in ["space", "bad", "grid"]:
            if len(pattern.sm.states) != 2:
                error_msg("For indentation '%s' only patterns are addmissible which\n" % identifier + \
                          "can be matched by a single character, e.g. \" \" or [a-z].", fh)
            transition_map = pattern.sm.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            trigger_set = transition_map.values()[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is not None: 
                indentation_setup.specify_space(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_space(pattern_str, trigger_set, variable, fh)
                else:
                    indentation_setup.specify_space(pattern_str, trigger_set, 1, fh)

        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None: 
                indentation_setup.specify_grid(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set, variable, fh)
                else:
                    error_msg("Missing integer or variable name after keyword 'grid'.", fh) 

        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)

        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, pattern.sm, fh)

        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, pattern.sm, fh)

        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg("Missing ';' after indentation '%s' specification." % identifier, fh)
Exemple #33
0
def __parse_option(fh, new_mode):
    def get_pattern_object(SM):
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else: result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return Pattern(result, AllowStateMachineTrafoF=True)

    identifier = read_option_start(fh)
    if identifier is None: return False

    verify_word_in_list(identifier, mode_option_info_db.keys(), "mode option",
                        fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(
            fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier,
                      fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The corresponding CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index,
                                  trigger_set,
                                  AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        new_mode.add_match(pattern_str,
                           action,
                           get_pattern_object(pattern_sm),
                           Comment=E_SpecialPatterns.SKIP)

        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full fledged regular expression as opener,
        # since it only affects the trigger. Not so the nested range skipper (see below).

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings' not state machines
            opener_str, opener_sequence = __parse_string(
                fh, "Opener pattern for 'skip_nested_range'")
            opener_sm = StateMachine.from_sequence(opener_sequence)
        else:
            opener_str, opener_pattern = regular_expression.parse(fh)
            opener_sm = opener_pattern.sm
            # For 'range skipping' the opener sequence is not needed, only the opener state
            # machine is webbed into the pattern matching state machine.
            opener_sequence = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = __parse_string(
            fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier,
                      fh)

        # Skipper code is to be generated later
        generator_function, comment = {
            "skip_range": (skip_range.do, E_SpecialPatterns.SKIP_RANGE),
            "skip_nested_range":
            (skip_nested_range.do, E_SpecialPatterns.SKIP_NESTED_RANGE),
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"] = new_mode.name

        new_mode.add_match(opener_str,
                           action,
                           get_pattern_object(opener_sm),
                           Comment=comment)

        return True

    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern_str = ""
        if value.newline_suppressor_state_machine.get() is not None:
            suppressed_newline_pattern_str = \
                  "(" + value.newline_suppressor_state_machine.pattern_string() + ")" \
                + "(" + value.newline_state_machine.pattern_string() + ")"

            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])

            FileName = value.newline_suppressor_state_machine.file_name
            LineN = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code = UserCodeFragment("goto %s;" % get_label("$start", U=True),
                                    FileName, LineN)

            new_mode.add_match(
                suppressed_newline_pattern_str,
                code,
                get_pattern_object(suppressed_newline_sm),
                Comment=E_SpecialPatterns.SUPPRESSED_INDENTATION_NEWLINE)

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #      Let               newline
        #      be defined as:    newline ([space]* newline)*
        #
        # This way empty lines are eaten away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index,
                          value.indentation_count_character_set(),
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = beautifier.do(x4)

        FileName = value.newline_state_machine.file_name
        LineN = value.newline_state_machine.line_n
        action = GeneratedCode(indentation_counter.do, FileName, LineN)

        action.data["indentation_setup"] = value

        new_mode.add_match(value.newline_state_machine.pattern_string(),
                           action,
                           get_pattern_object(sm),
                           Comment=E_SpecialPatterns.INDENTATION_NEWLINE)

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = mode_option_info_db[identifier]
    if option_info.domain is not None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (value, identifier) + \
                  "Though, possible for this option are only: %s." % repr(option_info.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True
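
# A syntax sketch, not part of the quex sources, of the mode options handled
# above. read_option_start() has consumed '<name:' before the branches run,
# and each option is terminated by '>':
#
#    mode EXAMPLE :
#        <skip:              [ \t\n]>      // character set skipper
#        <skip_range:        "/*" "*/">    // non-nesting range skipper
#        <skip_nested_range: "{"  "}">     // nesting range skipper
#        <indentation:       [ \t] => space 1; "\n" => newline; >
#    {
#        ...
#    }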
Exemple #34
0
def __create_token_sender_by_token_name(fh, TokenName):
    assert type(TokenName) in [str, unicode]

    # Enter token_id into database, if it is not yet defined.
    token_id_db_verify_or_enter_token_id(fh, TokenName)

    # Parse the token argument list
    argument_list = __parse_function_argument_list(fh, TokenName)

    # Create the token sender
    explicit_member_names_f = False
    for arg in argument_list:
        if arg.find("=") != -1: explicit_member_names_f = True

    assert blackboard.token_type_definition is not None, \
           "A valid token_type_definition must have been parsed at this point."

    if not explicit_member_names_f:
        # There are only two allowed cases for implicit token member names:
        #  QUEX_TKN_XYZ(Lexeme)     --> call take_text(Lexeme, LexemeEnd)
        #  QUEX_TKN_XYZ(Begin, End) --> call take_text(Begin, End)
        if   len(argument_list) == 2:
            return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, (%s), (%s));\n" % \
                   (argument_list[0], argument_list[1]) + \
                   "self_send(%s);\n" % (TokenName)

        elif len(argument_list) == 1:
            if argument_list[0] == "Lexeme":
                return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, self.buffer._lexeme_start_p, self.buffer._input_p);\n" \
                       "self_send(%s);\n" % (TokenName)
            elif argument_list[0] == "LexemeNull":
                return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, LexemeNull, LexemeNull);\n" \
                       "self_send(%s);\n" % (TokenName)
            else:
                error_msg("If one unnamed argument is specified it must be 'Lexeme'\n"          + \
                          "or 'LexemeNull'. Found '%s'.\n" % argument_list[0]                     + \
                          "To cut parts of the lexeme, please, use the 2 argument sender, e.g.\n" + \
                          "QUEX_TKN_MY_ID(Lexeme + 1, LexemeEnd - 2);\n"                             + \
                          "Alternatively, use named parameters such as 'number=...'.", fh)

        elif len(argument_list) == 0:
            return "self_send(%s);\n" % TokenName

        else:
            error_msg("Since 0.49.1, there are only the following brief token senders that can take\n"
                      "unnamed token arguments:\n"
                      "     one argument:   'Lexeme'   =>  token.take_text(..., LexemeBegin, LexemeEnd);\n"
                      "     two arguments:  Begin, End =>  token.take_text(..., Begin, End);\n"
                      + "Found: " + repr(argument_list)[1:-1] + ".", fh)

        # By this point the function has returned if member names were implicit.

    member_value_pairs = map(lambda x: x.split("="), argument_list)
    txt = ""
    for member, value in member_value_pairs:
        if value == "":
            error_msg("One explicit argument name mentioned requires all arguments to\n"  + \
                      "be mentioned explicitly. Value '%s' mentioned without argument.\n"   \
                      % member, fh)

        if Setup.token_class_file != "":
            error_msg("Member assignments in brief token senders are inadmissible\n" + \
                      "with manually written token classes. User provided file '%s'.\n" % Setup.token_class_file + \
                      "Found member assignment: '%s' = '%s'." % (member, value), fh)
        else:
            member_name = member.strip()
            verify_word_in_list(member_name, blackboard.token_type_definition.get_member_db(), 
                                "No member:   '%s' in token type description." % member_name, 
                                fh)
            idx = value.find("Lexeme")
            if idx != -1:
                if idx != 0 and value[idx-1] == "(":
                    pass
                else:
                    error_msg("Assignment of token member '%s' with 'Lexeme' directly being involved. The\n" % member_name + 
                              "'Lexeme' points into the text buffer and it is not owned by the token object.\n"
                              "\n"
                              "Proposals:\n\n"
                              "   (1) Use '(Lexeme)', i.e. surround 'Lexeme' by brackets to indicate\n"
                              "       that you are aware of the danger. Do this, if at the end of the\n"
                              "       process, the member can be assumed to relate to an object that\n"
                              "       is not directly dependent anymore on 'Lexeme'. This is particularly\n"
                              "       true if the member is of type 'std::string'. Its constructor\n"
                              "       creates a copy of the zero terminated string.\n\n"
                              "   (2) Use token senders without named arguments, for example\n"
                              "          \"%s(Lexeme+1, LexemeEnd-2)\"\n" % TokenName + 
                              "          \"%s(Lexeme)\"\n" % TokenName + 
                              "       These token senders create a copy of the lexeme and let the token\n"
                              "       own it.", fh)

            access = blackboard.token_type_definition.get_member_access(member_name)
            txt += "self_write_token_p()->%s = %s;\n" % (access, value.strip())


    # Box the token, stamp it with an id and 'send' it
    txt += "self_send(%s);\n" % TokenName
    return txt
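
# An output sketch, not part of the quex sources: the C code that the function
# above returns for a hypothetical token name QUEX_TKN_IDENTIFIER. Implicit
# one-argument 'Lexeme' case:
#
#    QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self,
#                               self.buffer._lexeme_start_p, self.buffer._input_p);
#    self_send(QUEX_TKN_IDENTIFIER);
#
# Explicitly named arguments, e.g. QUEX_TKN_NUMBER(number=4711), become one
# member assignment per pair (member access as reported by the token type
# definition) followed by the send:
#
#    self_write_token_p()->number = 4711;
#    self_send(QUEX_TKN_NUMBER);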
Exemple #35
0
def parse_section(fh):
    global default_token_type_definition_triggered_by_mode_definition_f

    # NOTE: End of File is supposed to be reached when trying to read a new
    #       section. Thus, the end-of-file catcher does not encompass the beginning.
    position = fh.tell()
    skip_whitespace(fh)
    word = read_identifier(fh, OnMissingStr="Missing section title")

    verify_word_in_list(word, blackboard.all_section_title_list, 
                        "Unknown quex section '%s'" % word, fh)
    try:
        # (*) determine what is defined
        #
        #     -- 'mode { ... }'     => define a mode
        #     -- 'start = ...;'     => define the name of the initial mode
        #     -- 'header { ... }'   => define code that is to be pasted on top
        #                              of the engine (e.g. "#include<...>")
        #     -- 'body { ... }'     => define code that is to be pasted in the class' body
        #                              of the engine (e.g. "public: int  my_member;")
        #     -- 'init { ... }'     => define code that is to be pasted in the class' constructors
        #                              of the engine (e.g. "my_member = -1;")
        #     -- 'define { ... }'   => define patterns shorthands such as IDENTIFIER for [a-z]+
        #     -- 'repeated_token_id = QUEX_TKN_ ...;' => enables token repetition, defines
        #                                                the token id to be repeated.
        #     -- 'token { ... }'    => define token ids
        #     -- 'token_type { ... }'  => define a customized token type
        #
        if word in blackboard.fragment_db.keys():
            element_name = blackboard.fragment_db[word]
            fragment     = code_fragment.parse(fh, word, AllowBriefTokenSenderF=False)        
            blackboard.__dict__[element_name] = fragment
            return

        elif word == "start":
            mode_name = parse_identifier_assignment(fh)
            if mode_name == "":
                error_msg("Missing mode_name after 'start ='", fh)

            elif not blackboard.initial_mode.sr.is_void():
                error_msg("start mode defined more than once!", fh, DontExitF=True)
                error_msg("previously defined here", blackboard.initial_mode.sr)
             
            blackboard.initial_mode = CodeUser(mode_name, SourceRef.from_FileHandle(fh))
            return

        elif word == "repeated_token":
            blackboard.token_repetition_token_id_list = parse_token_id_definitions(fh, NamesOnlyF=True)
            for token_name in blackboard.token_repetition_token_id_list:
                verify_word_in_list(token_name[len(Setup.token_id_prefix):],
                                    blackboard.token_id_db.keys(),
                                    "Token ID '%s' not yet defined." % token_name,
                                    fh, ExitF=False, 
                                    SuppressCode=NotificationDB.warning_repeated_token_not_yet_defined)
            return
            
        elif word == "define":
            parse_pattern_name_definitions(fh)
            return

        elif word == "token":       
            if Setup.token_id_foreign_definition:
                error_msg("Token id file '%s' has been specified.\n" \
                          % Setup.token_id_foreign_definition_file \
                          + "All token ids must be specified there. Section 'token'\n" \
                          + "is not allowed.", fh)

            parse_token_id_definitions(fh)
            return

        elif word == "token_type":       

            if Setup.token_class_file != "":
                error_msg("Section 'token_type' is intended to generate a token class.\n" \
                          + "However, the manually written token class file '%s'" \
                          % repr(Setup.token_class_file) \
                          + "has been specified on the command line.", fh)
       
            if blackboard.token_type_definition is None:
                blackboard.token_type_definition = token_type.parse(fh)
                return

            # Error case:
            if default_token_type_definition_triggered_by_mode_definition_f:
                error_msg("Section 'token_type' must appear before first mode definition.", fh)
            else:
                error_msg("Section 'token_type' has been defined twice.", fh, DontExitF=True)
                error_msg("Previously defined here.",
                          blackboard.token_type_definition.sr.file_name,
                          blackboard.token_type_definition.sr.line_n)
            return

        elif word == "mode":
            # When the first mode is parsed then a token_type definition must be 
            # present. If not, the default token type definition is considered.
            if blackboard.token_type_definition is None:
                parse_default_token_definition()
                default_token_type_definition_triggered_by_mode_definition_f = True

            mode.parse(fh)
            return

        else:
            # This case should have been caught by the 'verify_word_in_list' function
            assert False

    except EndOfStreamException:
        fh.seek(position)
        error_eof(word, fh)
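
# A layout sketch, not part of the quex sources: a source file whose sections
# parse_section() above dispatches on. 'token_type' would have to appear
# before the first 'mode' to avoid triggering the default token type:
#
#    header  { #include "my_definitions.h" }
#    define  { IDENTIFIER [a-z]+ }
#    token   { IDENTIFIER; NUMBER; }
#    start   = PROGRAM;
#    mode PROGRAM {
#        {IDENTIFIER} => QUEX_TKN_IDENTIFIER(Lexeme);
#    }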
Exemple #36
0
def prepare(command_line, argv):
    """RETURN:  True, if process needs to be started.
                False, if job is done.
    """
    global Setup

    # (*) Classes and their namespace
    __setup_analyzer_class(Setup)
    __setup_token_class(Setup)
    __setup_token_id_prefix(Setup)
    __setup_lexeme_null(Setup)       # Requires 'token_class_name_space'

    # (*) Output programming language        
    Setup.language = Setup.language.upper()
    verify_word_in_list(Setup.language, output_language_db.keys(),
                        "Programming language '%s' is not supported." % Setup.language)
    Setup.language_db  = output_language_db[Setup.language]
    Setup.extension_db = global_extension_db[Setup.language]

    # Is the output file naming scheme provided by the extension database
    # (Validation must happen immediately)
    if Setup.extension_db.has_key(Setup.output_file_naming_scheme) == False:
        error_msg("File extension scheme '%s' is not provided for language '%s'.\n" \
                  % (Setup.output_file_naming_scheme, Setup.language) + \
                  "Available schemes are: %s." % repr(Setup.extension_db.keys())[1:-1])

    # (*) Output files
    if   Setup.buffer_codec_name == "utf8":  module = utf8_state_split
    elif Setup.buffer_codec_name == "utf16": module = utf16_state_split
    else:                                    module = None
    Setup.buffer_codec_prepare(Setup.buffer_codec_name, 
                               Setup.buffer_codec_file, module)

    # AFTER: Setup.buffer_codec_prepare() !!!
    if Setup.language not in ["DOT"]:
        prepare_file_names(Setup)

    if Setup.buffer_byte_order == "<system>": 
        Setup.buffer_byte_order = sys.byteorder 
        Setup.byte_order_is_that_of_current_system_f = True
    else:
        Setup.byte_order_is_that_of_current_system_f = False

    if Setup.buffer_element_size == "wchar_t":
        error_msg("Since Quex version 0.53.5, 'wchar_t' can no longer be specified\n"
                  "with option '--buffer-element-size' or '-bes'. Please, specify\n"
                  "'--buffer-element-type wchar_t' or '--bet'.")

    if Setup.buffer_element_type == "wchar_t":
        Setup.converter_ucs_coding_name = "WCHAR_T"

    # (*) Determine buffer element type and size (in bytes)
    if Setup.buffer_element_size == -1:
        if global_character_type_db.has_key(Setup.buffer_element_type):
            Setup.buffer_element_size = global_character_type_db[Setup.buffer_element_type][3]
        elif Setup.buffer_element_type == "":
            Setup.buffer_element_size = 1
        else:
            # Buffer element type is not identified in 'global_character_type_db'.
            # => here Quex cannot know its size on its own.
            Setup.buffer_element_size = -1

    if Setup.buffer_element_type == "":
        if Setup.buffer_element_size in [1, 2, 4]:
            Setup.buffer_element_type = { 
                1: "uint8_t", 2: "uint16_t", 4: "uint32_t",
            }[Setup.buffer_element_size]
        elif Setup.buffer_element_size == -1:
            pass
        else:
            error_msg("Buffer element type cannot be determined for size '%i' which\n" \
                      % Setup.buffer_element_size + 
                      "has been specified by '-b' or '--buffer-element-size'.")

    type_info = global_character_type_db.get(Setup.buffer_element_type)
    if     type_info is not None and len(type_info) >= 4 \
       and type_info[3] != -1 and Setup.buffer_element_size != -1 \
       and type_info[3] != Setup.buffer_element_size:
        error_msg("\nBuffer element type ('--bet' or '--buffer-element-type') was set to '%s'.\n" \
                  % Setup.buffer_element_type \
                  + "It is well known to be of size %s[byte]. However, the buffer element size\n" \
                  % type_info[3] \
                  + "('-b' or '--buffer-element-type') was specified as '%s'.\n\n" \
                  % Setup.buffer_element_size \
                  + "Quex can continue, but the result is questionable.\n", \
                  DontExitF=True)

    Setup.converter_f = False
    if Setup.converter_iconv_f or Setup.converter_icu_f or len(Setup.converter_user_new_func) != 0:
        Setup.converter_f = True

    # The only case where no converter helper is required is where ASCII
    # (Unicode restricted to [0, FF]) is used.
    Setup.converter_helper_required_f = True
    if Setup.converter_f == False and Setup.buffer_element_size == 1 and Setup.buffer_codec.name == "unicode":
        Setup.converter_helper_required_f = False

    validation.do(Setup, command_line, argv)

    if Setup.converter_ucs_coding_name == "": 
        if global_character_type_db.has_key(Setup.buffer_element_type):
            if Setup.buffer_byte_order == "little": index = 1
            else:                                   index = 2
            Setup.converter_ucs_coding_name = global_character_type_db[Setup.buffer_element_type][index]

    if len(Setup.token_id_foreign_definition) != 0: 
        if len(Setup.token_id_foreign_definition) > 3: 
            error_msg("Option '--foreign-token-id-file' received > 3 followers.\n"
                      "Found: %s" % str(Setup.token_id_foreign_definition)[1:-1])
        if len(Setup.token_id_foreign_definition) > 1:
            Setup.token_id_foreign_definition_file_region_begin_re = \
                    __compile_regular_expression(Setup.token_id_foreign_definition[1], "token id region begin")
        if len(Setup.token_id_foreign_definition) > 2:
            Setup.token_id_foreign_definition_file_region_end_re = \
                    __compile_regular_expression(Setup.token_id_foreign_definition[2], "token id region end")
        Setup.token_id_foreign_definition_file = \
                Setup.token_id_foreign_definition[0]

        CommentDelimiterList = [["//", "\n"], ["/*", "*/"]]
        token_id_file_parse(Setup.token_id_foreign_definition_file, 
                            CommentDelimiterList)

    # (*) Compression Types
    compression_type_list = []
    for name, ctype in [("compression_template_f",         E_Compression.TEMPLATE),
                        ("compression_template_uniform_f", E_Compression.TEMPLATE_UNIFORM),
                        ("compression_path_f",             E_Compression.PATH),
                        ("compression_path_uniform_f",     E_Compression.PATH_UNIFORM)]:
        if command_line_args_defined(command_line, name):
            compression_type_list.append((command_line_arg_position(name), ctype))

    compression_type_list.sort(key=itemgetter(0))
    Setup.compression_type_list = map(lambda x: x[1], compression_type_list)

    # (*) return Setup ___________________________________________________________________
    return True
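
# A sketch, not part of the quex sources: the element-size/type fallback used
# above, isolated. Size -1 means 'undetermined'; only 1, 2, and 4 map to a
# known C type.
def resolve_buffer_element_type(size):
    return {1: "uint8_t", 2: "uint16_t", 4: "uint32_t"}.get(size)

assert resolve_buffer_element_type(2) == "uint16_t"
assert resolve_buffer_element_type(-1) is None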
Exemple #37
0
def do(setup, command_line, argv):
    """Does a consistency check for setup and the command line.
    """

    setup.output_directory = os.path.normpath(setup.output_directory)
    if setup.output_directory != "":
        # Check, if the output directory exists
        if os.access(setup.output_directory, os.F_OK) == False:
            error_msg("The directory %s was specified for output, but does not exists." % setup.output_directory)
        if os.access(setup.output_directory, os.W_OK) == False:
            error_msg("The directory %s was specified for output, but is not writeable." % setup.output_directory)

    # if the mode is '--language dot' => check character display options. 
    if setup.character_display not in ["hex", "utf8"]:
        error_msg("Character display must be either 'hex' or 'utf8'.\nFound: '%s'" % 
                  setup.character_display)

    # ensure that options are not specified twice
    for parameter, info in SETUP_INFO.items():
        if type(info) != list: continue
        occurence_n = 0 
        for option in info[0]:
            occurence_n += argv.count(option)
        if occurence_n > 1:
            error_msg("Received more than one of the following options:\n" + \
                      "%s" % repr(info[0])[1:-1])

    # (*) Check for 'Deprecated' Options ___________________________________________________
    for name, info in DEPRECATED.items():
        command_line_options = SETUP_INFO[name][0]
        comment                  = info[0]
        deprecated_since_version = info[1]
        for option in command_line_options:
            if command_line.search(option):
                error_msg("Command line option '%s' is ignored.\n" % option + \
                          comment + "\n" + \
                          "Last version of Quex supporting this option is version %s. Please, visit\n" % \
                          deprecated_since_version + \
                          "http://quex.org for further information.")
                          
    # (*) Check for 'Straying' Options ___________________________________________________
    options = []
    for key, info in SETUP_INFO.items():
        if type(info) != list: continue
        if key in DEPRECATED: continue
        if info[1] is not None: options.extend(info[0])
    options.sort(lambda a,b: cmp(a.replace("-",""), b.replace("-","")))

    ufos = command_line.unidentified_options(options)
    if len(ufos) != 0:
        error_msg("Unidentified option(s) = " +  repr(ufos) + "\n" + \
                  __get_supported_command_line_option_description(options))

    if setup.analyzer_derived_class_name != "" and \
       setup.analyzer_derived_class_file == "":
            error_msg("Specified derived class '%s' on command line, but it was not\n" % \
                      setup.analyzer_derived_class_name + \
                      "specified which file contains the definition of it.\n" + \
                      "use command line option '--derived-class-file'.\n")

    if setup.buffer_element_size not in [-1, 1, 2, 4]:
        error_msg("The setting of '--buffer-element-size' (or '-b') can only be\n" 
                  "1, 2, or 4 (found %s)." % repr(setup.buffer_element_size))

    if setup.buffer_byte_order not in ["<system>", "little", "big"]:
        error_msg("Byte order (option --endian) must be 'little', 'big', or '<system>'.\n" + \
                  "Note, that this option is only interesting for cross plattform development.\n" + \
                  "By default, quex automatically chooses the endian type of your system.")

    # Manually written token class requires token class name to be specified
    if setup.token_class_file != "" and command_line.search("--token-class", "--tc") == False:
        error_msg("The use of a manually written token class requires that the name of the class\n"
                  "is specified on the command line via the '--token-class' option.")
    
    # Token queue
    if setup.token_policy != "queue" and command_line.search("--token-queue-size"):
        error_msg("Option --token-queue-size determines a fixed token queue size. This makes\n" + \
                  "only sense in conjunction with '--token-policy queue'.\n")
    if setup.token_queue_size <= setup.token_queue_safety_border + 1:
        if setup.token_queue_size == setup.token_queue_safety_border: cmp_str = "equal to"
        else:                                                         cmp_str = "less than"
        error_msg("Token queue size is %i is %s token queue safety border %i + 1.\n" % \
                  (setup.token_queue_size, cmp_str, setup.token_queue_safety_border) + 
                  "Set appropriate values with --token-queue-size and --token-queue-safety-border.")

    # Check that names are valid identifiers
    __check_identifier(setup, "token_id_prefix_plain",    "Token prefix")
    __check_identifier(setup, "analyzer_class_name", "Engine name")
    if setup.analyzer_derived_class_name != "": 
        __check_identifier(setup, "analyzer_derived_class_name", "Derived class name")
    
    __check_file_name(setup, "token_class_file",            "file containing token class definition")
    __check_file_name(setup, "analyzer_derived_class_file", "file containing user derived lexer class")
    __check_file_name(setup, "token_id_foreign_definition_file", "file containing user token ids")
    __check_file_name(setup, "input_mode_files", "quex source file")

    # Check that not more than one converter is specified
    converter_n = 0
    if setup.converter_iconv_f:             converter_n += 1
    if setup.converter_icu_f:               converter_n += 1 
    if setup.converter_user_new_func != "": converter_n += 1
    if converter_n > 1:
        error_msg("More than one character converter has been specified. Note, that the\n" + \
                  "options '--icu', '--iconv', and '--converter-new' (or '--cn') are\n"    + \
                  "to be used mutually exclusively.")
    if converter_n == 1 and setup.buffer_codec != "unicode":  
        # If the buffer codec is other than unicode, then no converter shall
        # be used to fill the buffer. Instead, the engine is transformed, so 
        # that it works directly on the codec.
        error_msg("An engine that is to be generated for a specific codec cannot rely\n"      + \
                  "on converters. Do no use '--codec' together with '--icu', '--iconv', or\n" + \
                  "`--converter-new`.")

    # If a converter has been specified and no bytes-element-size has been specified,
    # it defaults to '1 byte' which is most likely not what is desired for unicode.
    if     converter_n == 1 \
       and setup.buffer_element_size == 1 \
       and not command_line_args_defined(command_line, "buffer_element_size") \
       and not command_line_args_defined(command_line, "buffer_element_type"):
        error_msg("A converter has been specified, but the default buffer element size\n" + \
                  "is left to 1 byte. Consider %s or %s." \
                  % (command_line_args_string("buffer_element_size"),
                     command_line_args_string("buffer_element_type")))

    # If a user defined type is specified for 'engine character type' and 
    # a converter, then the name of the target type must be specified explicitly.
    if         setup.buffer_element_type != "" \
       and not global_character_type_db.has_key(setup.buffer_element_type) \
       and     setup.converter_ucs_coding_name == "" \
       and     converter_n != 0:
        tc = setup.buffer_element_type
        error_msg("A character code converter has been specified. It is supposed to convert\n" + \
                  "incoming data into an internal buffer of unicode characters. The size of\n" + \
                  "each character is determined by '%s' which is a user defined type.\n" % tc  + \
                  "\n" + \
                  "Quex cannot determine automatically the name that the converter requires\n" +      \
                  "to produce unicode characters for type '%s'. It must be specified by the\n" % tc + \
                  "command line option %s." \
                  % command_line_args_string("converter_ucs_coding_name"))

    # Token transmission policy
    token_policy_list = ["queue", "single", "users_token", "users_queue"]
    if setup.token_policy not in token_policy_list:
        error_msg("Token policy '%s' not supported. Use one of the following:\n" % setup.token_policy + \
                  repr(token_policy_list)[1:-1])
    elif setup.token_policy == "users_token":
        error_msg("Token policy 'users_queue' has be deprecated since 0.49.1. Use\n"
                  "equivalent policy 'single'.")
    elif setup.token_policy == "users_queue":
        error_msg("Token policy 'users_queue' has be deprecated since 0.49.1\n")

    # Internal engine character encoding
    def __codec_vs_buffer_element_size(CodecName, RequiredBufferElementSize):
        if   setup.buffer_codec        != CodecName:                 return
        elif setup.buffer_element_size == RequiredBufferElementSize: return

        if setup.buffer_element_size == -1: 
            msg_str = "undetermined (found type '%s')" % setup.buffer_element_type
        else:
            msg_str = "is not %i (found %i)" % (RequiredBufferElementSize, setup.buffer_element_size)

        error_msg("Using codec '%s' while buffer element size %s.\n" % (CodecName, msg_str) + 
                  "Consult command line argument %s" \
                  % command_line_args_string("buffer_element_size"))

    if setup.buffer_codec != "unicode":
        if setup.buffer_codec_file == "":
            verify_word_in_list(setup.buffer_codec,
                                codec_db.get_supported_codec_list() + ["utf8", "utf16"],
                                "Codec '%s' is not supported." % setup.buffer_codec)
        __codec_vs_buffer_element_size("utf8", 1)
        __codec_vs_buffer_element_size("utf16", 2)

    if setup.external_lexeme_null_object and setup.token_class_only_f:
        error_msg("Specifying an external lexeme null object signalizes an\n"
                  "external token class implementation. The 'token class only\n"
                  "flag' generates a token class considered to be externally\n"
                  "shared. Both flags are mutually exclusive.")
Exemple #38
0
def __parse_option(fh, new_mode):
    def get_pattern_object(SM):
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else:                         result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return Pattern(result, AllowStateMachineTrafoF=True)

    identifier = read_option_start(fh)
    if identifier is None: return False

    verify_word_in_list(identifier, mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one element of the 
        # trigger set appears, the state machine enters the 'trigger set skipper section'.
        # Enter the skipper as if the opener pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The corresponding CodeFragment for skipping is created in 'implement_skippers(...)'
        pattern_sm  = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index, trigger_set, AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do, 
                               FileName = fh.name, 
                               LineN    = get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        new_mode.add_match(pattern_str, action, get_pattern_object(pattern_sm), 
                           Comment=E_SpecialPatterns.SKIP)

        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full fledged regular expression as opener,
        # since it only affects the trigger. Not so the nested range skipper (see below).

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings' not state machines
            opener_str, opener_sequence = __parse_string(fh, "Opener pattern for 'skip_nested_range'")
            opener_sm = StateMachine.from_sequence(opener_sequence)
        else:
            opener_str, opener_pattern = regular_expression.parse(fh)
            opener_sm = opener_pattern.sm
            # For 'range skipping' the opener sequence is not needed, only the opener state
            # machine is webbed into the pattern matching state machine.
            opener_sequence       = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = __parse_string(fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("missing closing '>' for mode option '%s'" % identifier, fh)

        # Skipper code is to be generated later
        generator_function, comment = { 
                "skip_range":        (skip_range.do,        E_SpecialPatterns.SKIP_RANGE),
                "skip_nested_range": (skip_nested_range.do, E_SpecialPatterns.SKIP_NESTED_RANGE),
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName = fh.name, 
                               LineN    = get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"]       = new_mode.name

        new_mode.add_match(opener_str, action, get_pattern_object(opener_sm), Comment=comment)

        return True
        
    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline,
        #    then newline does not trigger indentation counting.
        suppressed_newline_pattern_str = ""
        if value.newline_suppressor_state_machine.get() is not None:
            suppressed_newline_pattern_str = \
                  "(" + value.newline_suppressor_state_machine.pattern_string() + ")" \
                + "(" + value.newline_state_machine.pattern_string() + ")"
                                           
            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])
                 
            FileName = value.newline_suppressor_state_machine.file_name
            LineN    = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code = UserCodeFragment("goto %s;" % get_label("$start", U=True), FileName, LineN)

            new_mode.add_match(suppressed_newline_pattern_str, code, 
                               get_pattern_object(suppressed_newline_sm),
                               Comment=E_SpecialPatterns.SUPPRESSED_INDENTATION_NEWLINE)

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick: 
        #
        #      Let               newline         
        #      be defined as:    newline ([space]* newline)*
        # 
        # This way empty lines are eaten away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index, value.indentation_count_character_set(), 
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- nfa to dfa; hopcroft optimization
        sm = beautifier.do(x4)

        FileName = value.newline_state_machine.file_name
        LineN    = value.newline_state_machine.line_n
        action   = GeneratedCode(indentation_counter.do, FileName, LineN)

        action.data["indentation_setup"] = value

        new_mode.add_match(value.newline_state_machine.pattern_string(), action, 
                           get_pattern_object(sm), 
                           Comment=E_SpecialPatterns.INDENTATION_NEWLINE)

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = mode_option_info_db[identifier]
    if option_info.domain is not None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (value, identifier) + \
                  "Though, possible for this option are only: %s." % repr(option_info.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)

    return True
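
# A pipeline sketch, not part of the quex sources: what get_pattern_object()
# defined at the top of this example does to any incoming state machine SM:
#
#    SM --(nfa_to_dfa.do, unless already DFA-compliant)--> DFA
#       --(hopcroft.do, minimization in place)-----------> minimal DFA
#       --(Pattern(..., AllowStateMachineTrafoF=True))---> pattern object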
Exemple #39
0
def do(setup, command_line, argv):
    """Does a consistency check for setup and the command line.
    """

    setup.output_directory = os.path.normpath(setup.output_directory)
    if setup.output_directory:
        # Check, if the output directory exists
        if os.access(setup.output_directory, os.F_OK) == False:
            error_msg(
                "The directory %s was specified for output, but does not exists."
                % setup.output_directory)
        if os.access(setup.output_directory, os.W_OK) == False:
            error_msg(
                "The directory %s was specified for output, but is not writeable."
                % setup.output_directory)

    # if the mode is '--language dot' => check character display options.
    if setup.character_display not in ["hex", "utf8"]:
        error_msg(
            "Character display must be either 'hex' or 'utf8'.\nFound: '%s'" %
            setup.character_display)

    # ensure that options are not specified twice
    for parameter, info in SETUP_INFO.items():
        if type(info) != list: continue
        occurence_n = 0
        for option in info[0]:
            occurence_n += argv.count(option)
        if occurence_n > 1 and info[1] not in (SetupParTypes.LIST,
                                               SetupParTypes.INT_LIST):
            error_msg("Received more than one of the following options:\n" + \
                      "%s" % repr(info[0])[1:-1])

    # (*) Check for 'Deprecated' Options ___________________________________________________
    for name, info in DEPRECATED.items():
        command_line_options = SETUP_INFO[name][0]
        comment = info[0]
        deprecated_since_version = info[1]
        for option in command_line_options:
            if command_line.search(option):
                error_msg("Command line option '%s' is ignored.\n" % option + \
                          comment + "\n" + \
                          "Last version of Quex supporting this option is version %s. Please, visit\n" % \
                          deprecated_since_version + \
                          "http://quex.org for further information.")

    # (*) Check for 'Straying' Options ___________________________________________________
    options = []
    for key, info in SETUP_INFO.items():
        if type(info) != list: continue
        if key in DEPRECATED: continue
        if info[1] is not None: options.extend(info[0])
    options.sort(lambda a, b: cmp(a.replace("-", ""), b.replace("-", "")))

    ufos = command_line.unidentified_options(options)
    if len(ufos) != 0:
        error_msg("Unidentified option(s) = " +  repr(ufos) + "\n" + \
                  __get_supported_command_line_option_description(options))

    if setup.analyzer_derived_class_name != "" and \
       setup.analyzer_derived_class_file == "":
        error_msg("Specified derived class '%s' on command line, but it was not\n" % \
                  setup.analyzer_derived_class_name + \
                  "specified which file contains the definition of it.\n" + \
                  "use command line option '--derived-class-file'.\n")

    if setup.buffer_element_size not in [-1, 1, 2, 4]:
        error_msg(
            "The setting of '--buffer-element-size' (or '-b') can only be\n"
            "1, 2, or 4 (found %s)." % repr(setup.buffer_element_size))

    if setup.buffer_byte_order not in ["<system>", "little", "big"]:
        error_msg("Byte order (option --endian) must be 'little', 'big', or '<system>'.\n" + \
                  "Note, that this option is only interesting for cross plattform development.\n" + \
                  "By default, quex automatically chooses the endian type of your system.")

    # Manually written token class requires token class name to be specified
    if setup.token_class_file != "" and command_line.search(
            "--token-class", "--tc") == False:
        error_msg(
            "The use of a manually written token class requires that the name of the class\n"
            "is specified on the command line via the '--token-class' option.")

    # Token queue
    if setup.token_policy != "queue" and command_line.search(
            "--token-queue-size"):
        error_msg("Option --token-queue-size determines a fixed token queue size. This makes\n" + \
                  "only sense in conjunction with '--token-policy queue'.\n")
    if setup.token_queue_size <= setup.token_queue_safety_border + 1:
        if setup.token_queue_size == setup.token_queue_safety_border:
            cmp_str = "equal to"
        else:
            cmp_str = "less than"
        error_msg("Token queue size is %i is %s token queue safety border %i + 1.\n" % \
                  (setup.token_queue_size, cmp_str, setup.token_queue_safety_border) +
                  "Set appropriate values with --token-queue-size and --token-queue-safety-border.")

    # Check that names are valid identifiers
    if len(setup.token_id_prefix_plain) != 0:
        __check_identifier(setup, "token_id_prefix_plain", "Token prefix")
    __check_identifier(setup, "analyzer_class_name", "Engine name")
    if setup.analyzer_derived_class_name != "":
        __check_identifier(setup, "analyzer_derived_class_name",
                           "Derived class name")

    __check_file_name(setup, "token_class_file",
                      "file containing token class definition")
    __check_file_name(setup, "analyzer_derived_class_file",
                      "file containing user derived lexer class")
    __check_file_name(
        setup,
        "token_id_foreign_definition_file",
        "file containing user token ids",
        0,
        CommandLineOption=SETUP_INFO["token_id_foreign_definition"][0])
    __check_file_name(setup, "input_mode_files", "quex source file")

    # Check that not more than one converter is specified
    converter_n = 0
    if setup.converter_iconv_f: converter_n += 1
    if setup.converter_icu_f: converter_n += 1
    if len(setup.converter_user_new_func) != 0: converter_n += 1
    if converter_n > 1:
        error_msg("More than one character converter has been specified. Note, that the\n" + \
                  "options '--icu', '--iconv', and '--converter-new' (or '--cn') are\n"    + \
                  "to be used mutually exclusively.")
    if converter_n == 1 and setup.buffer_codec.name != "unicode":
        # If the buffer codec is other than unicode, then no converter shall
        # be used to fill the buffer. Instead, the engine is transformed, so
        # that it works directly on the codec.
        error_msg("An engine that is to be generated for a specific codec cannot rely\n"      + \
                  "on converters. Do no use '--codec' together with '--icu', '--iconv', or\n" + \
                  "`--converter-new`.")

    # If a converter has been specified and no bytes-element-size has been specified,
    # it defaults to '1 byte' which is most likely not what is desired for unicode.
    if     converter_n == 1 \
       and setup.buffer_element_size == 1 \
       and not command_line_args_defined(command_line, "buffer_element_size") \
       and not command_line_args_defined(command_line, "buffer_element_type"):
        error_msg("A converter has been specified, but the default buffer element size\n" + \
                  "is left to 1 byte. Consider %s or %s." \
                  % (command_line_args_string("buffer_element_size"),
                     command_line_args_string("buffer_element_type")))

    # If a user defined type is specified for 'engine character type' and
    # a converter, then the name of the target type must be specified explicitly.
    if         setup.buffer_element_type != "" \
       and not global_character_type_db.has_key(setup.buffer_element_type) \
       and     setup.converter_ucs_coding_name == "" \
       and     converter_n != 0:
        tc = setup.buffer_element_type
        error_msg("A character code converter has been specified. It is supposed to convert\n" + \
                  "incoming data into an internal buffer of unicode characters. The size of\n" + \
                  "each character is determined by '%s' which is a user defined type.\n" % tc  + \
                  "\n" + \
                  "Quex cannot determine automatically the name that the converter requires\n" +      \
                  "to produce unicode characters for type '%s'. It must be specified by the\n" % tc + \
                  "command line option %s." \
                  % command_line_args_string("converter_ucs_coding_name"))

    # Token transmission policy
    token_policy_list = ["queue", "single", "users_token", "users_queue"]
    if setup.token_policy not in token_policy_list:
        error_msg("Token policy '%s' not supported. Use one of the following:\n" % setup.token_policy + \
                  repr(token_policy_list)[1:-1])
    elif setup.token_policy == "users_token":
        error_msg(
            "Token policy 'users_queue' has be deprecated since 0.49.1. Use\n"
            "equivalent policy 'single'.")
    elif setup.token_policy == "users_queue":
        error_msg(
            "Token policy 'users_queue' has be deprecated since 0.49.1\n")

    # Internal engine character encoding
    def __codec_vs_buffer_element_size(CodecName, RequiredBufferElementSize):
        if setup.buffer_codec.name != CodecName: return
        elif setup.buffer_element_size == RequiredBufferElementSize: return

        if setup.buffer_element_size == -1:
            msg_str = "undetermined (found type '%s')" % setup.buffer_element_type
        else:
            msg_str = "is not %i (found %i)" % (RequiredBufferElementSize,
                                                setup.buffer_element_size)

        error_msg("Using codec '%s' while buffer element size %s.\n" % (CodecName, msg_str) +
                  "Consult command line argument %s" \
                  % command_line_args_string("buffer_element_size"))

    if setup.buffer_codec.name != "unicode":
        if not setup.buffer_codec_file:
            verify_word_in_list(
                setup.buffer_codec_name,
                codec_db.get_supported_codec_list() + ["utf8", "utf16"],
                "Codec '%s' is not supported." % setup.buffer_codec.name)
        __codec_vs_buffer_element_size("utf8", 1)
        __codec_vs_buffer_element_size("utf16", 2)

    if setup.external_lexeme_null_object and setup.token_class_only_f:
        error_msg(
            "Specifying an external lexeme null object signalizes an\n"
            "external token class implementation. The 'token class only\n"
            "flag' generates a token class considered to be externally\n"
            "shared. Both flags are mutually exclusive.")

    if setup.string_accumulator_f:
        error_n = NotificationDB.warning_on_no_token_class_take_text
        if error_n in setup.suppressed_notification_list:
            error_msg("The warning upon missing 'take_text' in token type definition is de-\n"
                      "activated by '--suppress %i'. This is dangerous if there is a string\n" % error_n +
                      "accumulator. Maybe use '--no-string-accumulator'.",
                      DontExitF=True,
                      WarningF=True,
                      SuppressCode=NotificationDB.warning_on_no_warning_on_missing_take_text)
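
The converter check near the top of this validation routine counts how many of
the mutually exclusive converter options were given and reports an error as
soon as more than one is active. A minimal, self-contained sketch of that
pattern follows; the function name and the SystemExit stand-in for error_msg
are illustrative, not part of the Quex API:

def check_mutually_exclusive(flag_db):
    # 'flag_db' maps a command line option to whether it was specified.
    active = [name for name, given_f in flag_db.items() if given_f]
    if len(active) > 1:
        raise SystemExit("More than one character converter specified: %s"
                         % ", ".join(sorted(active)))

check_mutually_exclusive({"--iconv": True, "--icu": False, "--converter-new": False})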
Exemple #41
0
def __perform_setup(command_line, argv):
    """RETURN:  True, if process needs to be started.
                False, if job is done.
    """
    global setup

    # (*) Classes and their namespace
    __setup_analyzer_class(setup)
    __setup_token_class(setup)
    __setup_token_id_prefix(setup)
    __setup_lexeme_null(setup)       # Requires 'token_class_name_space'

    # (*) Output programming language        
    setup.language = setup.language.upper()
    verify_word_in_list(setup.language,
                        quex_core_engine_generator_languages_db.keys(),
                        "Programming language '%s' is not supported." % setup.language)
    setup.language_db  = quex_core_engine_generator_languages_db[setup.language]
    setup.extension_db = global_extension_db[setup.language]

    # Is the output file naming scheme provided by the extension database
    # (Validation must happen immediately)
    if not setup.extension_db.has_key(setup.output_file_naming_scheme):
        error_msg("File extension scheme '%s' is not provided for language '%s'.\n" \
                  % (setup.output_file_naming_scheme, setup.language) + \
                  "Available schemes are: %s." % repr(setup.extension_db.keys())[1:-1])

    # Before file names can be prepared, determine the output directory
    # If 'source packaging' is enabled and no output directory is specified
    # then take the directory of the source packaging.
    if setup.source_package_directory != "" and setup.output_directory == "":
        setup.output_directory = setup.source_package_directory

    if setup.buffer_codec in ["utf8", "utf16"]:
        setup.buffer_codec_transformation_info = setup.buffer_codec + "-state-split"

    elif setup.buffer_codec_file != "":
        try:
            setup.buffer_codec = os.path.splitext(os.path.basename(setup.buffer_codec_file))[0]
        except Exception:
            error_msg("Cannot interpret string following '--codec-file'.")

        setup.buffer_codec_transformation_info = codec_db.get_codec_transformation_info(FileName=setup.buffer_codec_file)

    elif setup.buffer_codec != "unicode":
        setup.buffer_codec_transformation_info = codec_db.get_codec_transformation_info(setup.buffer_codec)

    if setup.buffer_codec != "unicode":
        setup.buffer_element_size_irrelevant = True
    
    # (*) Output files
    if setup.language not in ["DOT"]:
        prepare_file_names(setup)

    if setup.buffer_byte_order == "<system>": 
        setup.buffer_byte_order = sys.byteorder 
        setup.byte_order_is_that_of_current_system_f = True
    else:
        setup.byte_order_is_that_of_current_system_f = False

    if setup.buffer_element_size == "wchar_t":
        error_msg("Since Quex version 0.53.5, 'wchar_t' can no longer be specified\n"
                  "with option '--buffer-element-size' or '-bes'. Please, specify\n"
                  "'--buffer-element-type wchar_t' or '--bet'.")

    if setup.buffer_element_type == "wchar_t":
        setup.converter_ucs_coding_name = "WCHAR_T"

    make_numbers(setup)

    # (*) Determine buffer element type and size (in bytes)
    if setup.buffer_element_size == -1:
        if global_character_type_db.has_key(setup.buffer_element_type):
            setup.buffer_element_size = global_character_type_db[setup.buffer_element_type][3]
        elif setup.buffer_element_type == "":
            setup.buffer_element_size = 1
        else:
            # The buffer element type is specified, but it is not in the
            # database, so Quex cannot determine its size on its own.
            setup.buffer_element_size = -1

    if setup.buffer_element_type == "":
        if setup.buffer_element_size in [1, 2, 4]:
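            # Derive the type from the size, e.g. a 2 byte element selects 'uint16_t'.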
            setup.buffer_element_type = { 
                1: "uint8_t", 2: "uint16_t", 4: "uint32_t",
            }[setup.buffer_element_size]
        elif setup.buffer_element_size == -1:
            pass
        else:
            error_msg("Buffer element type cannot be determined for size '%i' which\n" \
                      % setup.buffer_element_size + 
                      "has been specified by '-b' or '--buffer-element-size'.")

    setup.converter_f = False
    if setup.converter_iconv_f or setup.converter_icu_f:
        setup.converter_f = True

    # The only case where no converter helper is required is where ASCII
    # (Unicode restricted to [0, FF]) is used.
    setup.converter_helper_required_f = True
    if not setup.converter_f and setup.buffer_element_size == 1 and setup.buffer_codec == "unicode":
        setup.converter_helper_required_f = False

    validation.do(setup, command_line, argv)

    if setup.converter_ucs_coding_name == "": 
        if global_character_type_db.has_key(setup.buffer_element_type):
            if setup.buffer_byte_order == "little": index = 1
            else:                                   index = 2
            setup.converter_ucs_coding_name = global_character_type_db[setup.buffer_element_type][index]

    if setup.token_id_foreign_definition_file != "": 
        CommentDelimiterList = [["//", "\n"], ["/*", "*/"]]
        # Regular expression to find '#include <something>' and extract the 'something'
        # in a 'group'. Note that '(' ')' cause the storage of parts of the match.
        IncludeRE            = "#[ \t]*include[ \t]*[\"<]([^\">]+)[\">]"
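        # Example: '#  include "my_token_ids.h"' yields group(1) == 'my_token_ids.h'.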
        #
        parse_token_id_file(setup.token_id_foreign_definition_file, 
                            setup.token_id_prefix, 
                            CommentDelimiterList, IncludeRE)
        if setup.token_id_prefix_plain != setup.token_id_prefix:
            # Token ids without the name space prefix (the 'plain' form) are also supported.
            parse_token_id_file(setup.token_id_foreign_definition_file, 
                                setup.token_id_prefix_plain, 
                                CommentDelimiterList, IncludeRE)

    # (*) Compression Types
    compression_type_list = []
    for name, ctype in [("compression_template_f",         E_Compression.TEMPLATE),
                        ("compression_template_uniform_f", E_Compression.TEMPLATE_UNIFORM),
                        ("compression_path_f",             E_Compression.PATH),
                        ("compression_path_uniform_f",     E_Compression.PATH_UNIFORM)]:
        if command_line_args_defined(command_line, name):
            compression_type_list.append((command_line_arg_position(name), ctype))
    compression_type_list.sort(key=itemgetter(0))
    setup.compression_type_list = map(lambda x: x[1], compression_type_list)

    # (*) return setup ___________________________________________________________________
    return True
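
The compression type handling above preserves the order in which the options
appeared on the command line: each selected type is paired with its argument
position, the pairs are sorted by position, and the positions are then
dropped. A reduced sketch of that pattern, with invented argument positions:

from operator import itemgetter

# (command line position, compression type); positions invented for illustration.
selected = [(12, "TEMPLATE"), (3, "PATH"), (7, "PATH_UNIFORM")]
selected.sort(key=itemgetter(0))
ordered_types = [ctype for _, ctype in selected]
assert ordered_types == ["PATH", "PATH_UNIFORM", "TEMPLATE"]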
Exemple #42
0
def do(fh):
    """Parses pattern definitions of the form:
   
          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

       In other words, the right-hand side *must* be a character set.
          
    """
    indentation_setup = IndentationSetup(fh)

    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    skip_whitespace(fh)

    while True:
        skip_whitespace(fh)

        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, pattern = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.",
                      fh)

        verify_word_in_list(
            identifier, ["space", "grid", "bad", "newline", "suppressor"],
            "Unrecognized indentation specifier '%s'." % identifier, fh)

        trigger_set = None
        if identifier in ["space", "bad", "grid"]:
            if len(pattern.sm.states) != 2:
                error_msg("For indentation '%s' only patterns are addmissible which\n" % identifier + \
                          "can be matched by a single character, e.g. \" \" or [a-z].", fh)
            transition_map = pattern.sm.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            trigger_set = transition_map.values()[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_space(pattern_str, trigger_set,
                                                value, fh)
            else:
                # No number was read; is it a variable name?
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_space(pattern_str, trigger_set,
                                                    variable, fh)
                else:
                    indentation_setup.specify_space(pattern_str, trigger_set,
                                                    1, fh)

        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value,
                                               fh)
            else:
                # No number was read; is it a variable name?
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set,
                                                   variable, fh)
                else:
                    error_msg(
                        "Missing integer or variable name after keyword 'grid'.",
                        fh)

        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)

        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, pattern.sm, fh)

        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, pattern.sm, fh)

        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg(
                "Missing ';' after indentation '%s' specification." %
                identifier, fh)
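
The 'space' and 'grid' branches above share one fallback scheme: try to read
an integer, fall back to a variable name, and only 'space' has a final default
of 1 ('grid' reports an error instead). A reduced sketch of that scheme, with
a plain string standing in for the file handle and the reader functions:

import re

def parse_count(text, default=None):
    # Simplified stand-in for the read_integer/read_identifier fallback.
    text = text.strip()
    if text.isdigit():                    # integer count, e.g. 'space 4;'
        return int(text)
    if re.match(r"[A-Za-z_]\w*$", text):  # variable name, e.g. 'grid tabsize;'
        return text
    if default is not None:               # only 'space' defaults, to a count of 1
        return default
    raise ValueError("Missing integer or variable name.")

assert parse_count("4") == 4
assert parse_count("tabsize") == "tabsize"
assert parse_count("", default=1) == 1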