def parse_token_id_definitions(fh, NamesOnlyF=False):
    """Parse the body of a 'token { ... }' section.

    fh         -- input file handle, positioned after the section keyword.
    NamesOnlyF -- if True, only collect the (prefixed) token names and
                  return them sorted; the global 'blackboard.token_id_db'
                  is left untouched.

    RETURNS: sorted list of token names if NamesOnlyF is True,
             otherwise None (entries are stored in 'blackboard.token_id_db').
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    token_prefix       = Setup.token_id_prefix
    token_prefix_plain = Setup.token_id_prefix_plain  # i.e. without name space included

    if NamesOnlyF: db = {}
    else:          db = blackboard.token_id_db

    skip_whitespace(fh)
    if not check(fh, "{"):
        error_msg("missing opening '{' for after 'token' section identifier.\n", fh)

    while check(fh, "}") == False:
        skip_whitespace(fh)

        candidate = read_identifier(fh, TolerantF=True)

        if candidate == "":
            # BUG FIX: the original did '"Missing valid token identifier." % candidate'
            # -- a '%' on a format string without placeholder raises TypeError
            # instead of reporting the parse error.
            error_msg("Missing valid token identifier.", fh)

        # -- check the name; if it starts with the token prefix paste a warning.
        #    Guard against empty prefixes: ''.find('') == 0 is always true.
        suspicious_prefix = None
        if len(token_prefix) != 0 and candidate.find(token_prefix) == 0:
            suspicious_prefix = token_prefix
        elif len(token_prefix_plain) != 0 and candidate.find(token_prefix_plain) == 0:
            # BUG FIX: this branch previously reported 'token_prefix' in the
            # message even though 'token_prefix_plain' is what matched.
            suspicious_prefix = token_prefix_plain

        if suspicious_prefix is not None:
            error_msg("Token identifier '%s' starts with token prefix '%s'.\n" % (candidate, suspicious_prefix) + \
                      "Token prefix is mounted automatically. This token id appears in the source\n" + \
                      "code as '%s%s'." % (token_prefix, candidate), \
                      fh, DontExitF=True)

        skip_whitespace(fh)

        if NamesOnlyF:
            db[token_prefix + candidate] = True
            if check(fh, ";") == False:
                error_msg("Missing ';' after definition of token identifier '%s'.\n" % candidate + \
                          "This is mandatory since Quex version 0.50.1.", fh)
            continue

        # Parse a possible numeric value after '='
        numeric_value = None
        if check(fh, "="):
            skip_whitespace(fh)
            numeric_value = read_integer(fh)
            if numeric_value is None:
                error_msg("Missing number after '=' for token identifier '%s'." % candidate, fh)

        if check(fh, ";") == False:
            error_msg("Missing ';' after definition of token identifier '%s'.\n" % candidate + \
                      "This is mandatory since Quex version 0.50.1.", fh)

        db[candidate] = TokenInfo(candidate, numeric_value,
                                  Filename=fh.name,
                                  LineN=get_current_line_info_number(fh))

    if NamesOnlyF:
        result = db.keys()
        result.sort()
        return result
def parse_state_statistics(fh):
    """Parse a state statistics section of the form

           { mode: NAME; state: INDEX; { B0 B1 ...; C0 C1 ...; } }

    RETURNS: StateStatistics object carrying the mode name, the state
             index, and the boundary/counter lists converted to integers.
    """
    skip_whitespace(fh)
    check_or_die(fh, "{")
    check_or_die(fh, "mode:")
    skip_whitespace(fh)
    mode_name = read_until_letter(fh, ";")
    check_or_die(fh, "state:")
    skip_whitespace(fh)
    state_index = read_integer(fh)
    check_or_die(fh, ";")
    check_or_die(fh, "{")
    skip_whitespace(fh)
    boundary_list_str = read_until_letter(fh, ";")
    skip_whitespace(fh)
    counter_list_str = read_until_letter(fh, ";")
    skip_whitespace(fh)
    check_or_die(fh, "}")
    check_or_die(fh, "}")

    def help(X):
        # BUG FIX: the original had no 'return', so help() always produced
        # None; StateStatistics then received None instead of the lists.
        return [int(x) for x in X.strip().split()]

    return StateStatistics(mode_name, state_index,
                           help(boundary_list_str),
                           help(counter_list_str))
def parse_state_statistics(fh):
    """Parse a state statistics section of the form

           { mode: NAME; state: INDEX; { B0 B1 ...; C0 C1 ...; } }

    RETURNS: StateStatistics object carrying the mode name, the state
             index, and the boundary/counter lists converted to integers.
    """
    skip_whitespace(fh)
    check_or_die(fh, "{")
    check_or_die(fh, "mode:")
    skip_whitespace(fh)
    mode_name = read_until_letter(fh, ";")
    check_or_die(fh, "state:")
    skip_whitespace(fh)
    state_index = read_integer(fh)
    check_or_die(fh, ";")
    check_or_die(fh, "{")
    skip_whitespace(fh)
    boundary_list_str = read_until_letter(fh, ";")
    skip_whitespace(fh)
    counter_list_str = read_until_letter(fh, ";")
    skip_whitespace(fh)
    check_or_die(fh, "}")
    check_or_die(fh, "}")

    def help(X):
        # BUG FIX: the original had no 'return', so help() always produced
        # None; StateStatistics then received None instead of the lists.
        return [int(x) for x in X.strip().split()]

    return StateStatistics(mode_name, state_index,
                           help(boundary_list_str),
                           help(counter_list_str))
def get_codec_transformation_info(Codec=None, FileName=None, FH=-1, LineN=None):
    """Provides the information about the relation of character codes in a
    particular coding to unicode character codes. It is provided in the
    following form:

           # Codec Values                 Unicode Values
           [ (Source0_Begin, Source0_End, TargetInterval0_Begin),
             (Source1_Begin, Source1_End, TargetInterval1_Begin),
             ...                                                  ]

    Arguments FH and LineN correspond to the arguments of error_msg.
    """
    assert Codec is not None or FileName is not None

    if FileName is not None:
        file_name = FileName
    else:
        distinct_codec = __get_distinct_codec_name_for_alias(Codec)
        file_name      = __codec_db_path + "/%s.dat" % distinct_codec

    fh = open_file_or_die(file_name, "rb")

    # Each record in the codec file consists of three integers; collect
    # them until the stream is exhausted.
    transformation_list = []
    try:
        while True:
            record = []
            for what in ("source interval begin",
                         "source interval size",
                         "target interval begin"):
                skip_whitespace(fh)
                number = read_integer(fh)
                if number is None:
                    error_msg("Missing integer (%s) in codec file." % what, fh)
                record.append(number)

            begin, size, target = record
            transformation_list.append([begin, begin + size, target])

    except EndOfStreamException:
        pass

    return transformation_list
def do(section_list, fh):
    """Parses a codec information file into 'section_list'. The described
    codec must be a 'static character length' encoding, i.e. every
    character occupies the same number of bytes.

    RETURNS: [0] NumberSet of unicode characters covered by the codec.
             [1] NumberSet of code values used by the codec elements.
             [2] Error string, or None if the file was read completely.
    """
    source_set = NumberSet()
    drain_set  = NumberSet()

    error_str = None
    try:
        while True:
            skip_whitespace(fh)
            begin = read_integer(fh)
            if begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                break

            skip_whitespace(fh)
            size = read_integer(fh)
            if size is None:
                error_str = "Missing integer (source interval size) in codec file."
                break

            skip_whitespace(fh)
            target = read_integer(fh)
            if target is None:
                error_str = "Missing integer (target interval begin) in codec file."
                break

            end = begin + size
            section_list.append([begin, end, target])
            source_set.add_interval(Interval(begin, end))
            drain_set.add_interval(Interval(target, target + size))

    except EndOfStreamException:
        # Normal termination: the whole file was consumed.
        pass

    return source_set, drain_set, error_str
def do(section_list, fh):
    """Parses a codec information file into 'section_list'. The described
    codec must be a 'static character length' encoding, i.e. every
    character occupies the same number of bytes.

    RETURNS: [0] NumberSet of unicode characters covered by the codec.
             [1] NumberSet of code values used by the codec elements.
             [2] Error string, or None if the file was read completely.
    """
    covered_set = NumberSet()
    value_set   = NumberSet()

    error_str = None
    try:
        while error_str is None:
            skip_whitespace(fh)
            src_begin = read_integer(fh)
            if src_begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                continue

            skip_whitespace(fh)
            src_size = read_integer(fh)
            if src_size is None:
                error_str = "Missing integer (source interval size) in codec file."
                continue

            skip_whitespace(fh)
            trg_begin = read_integer(fh)
            if trg_begin is None:
                error_str = "Missing integer (target interval begin) in codec file."
                continue

            src_end = src_begin + src_size
            section_list.append([src_begin, src_end, trg_begin])
            covered_set.add_interval(Interval(src_begin, src_end))
            value_set.add_interval(Interval(trg_begin, trg_begin + src_size))

    except EndOfStreamException:
        # Normal termination: the whole file was consumed.
        pass

    return covered_set, value_set, error_str
def read_character_code(fh):
    """Read one character-code specification from the stream.

    Accepted forms (tried in this order):
       'x'      -- a (possibly backslashed) utf8 character in single quotes,
       UC NAME  -- a unicode character name,
                   e.g. UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE,
       integer  -- a plain number, whatever read_integer() accepts.

    RETURNS: the character code as integer, or -1 if none of the forms
             matched; in that case the stream position is restored.
    """
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()          # remember position so mismatches can be undone
    start = fh.read(1)

    if start == "":
        # End of stream -- nothing to read.
        fh.seek(pos); return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code is None:
            error.log("Missing utf8-character for definition of character code by character.", fh)
        elif fh.read(1) != '\'':
            error.log("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        # Only 'UC' introduces a unicode name; anything else is not ours.
        if fh.read(1) != "C": fh.seek(pos); return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "": fh.seek(pos); return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            # A string result signals failure; report with suggestions.
            error.verify_word_in_list(ucs_name, ucs_property_db["Name"].code_point_db,
                                      "The string %s\ndoes not identify a known unicode character." % ucs_name,
                                      fh)
        elif type(character_code) not in [int, long]:
            error.log("%s relates to more than one character in unicode database." % ucs_name, fh)
        return character_code

    # Not a quote, not 'UC': retry from the start as a plain integer.
    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None: return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1
def _read_value_specifier(fh, Keyword, Default=None):
    """Read the value that follows 'Keyword': an integer, or failing that
    an identifier naming a variable.

    RETURNS: the integer, the identifier string, or 'Default' (when given
             and neither was found). Reports an error otherwise.
    """
    skip_whitespace(fh)

    number = read_integer(fh)
    if number is not None:
        return number

    # Not a number -- perhaps the value is given by a variable name.
    name = read_identifier(fh)
    if name != "":
        return name

    if Default is not None:
        return Default

    error.log("Missing integer or variable name after keyword '%s'." % Keyword, fh)
def __get_integer(MemberName):
    """Return the setup member 'MemberName' as an integer.

    If the stored value already is an int it is returned unchanged.
    Otherwise it is parsed with read_integer(), which understands the
    prefixes 0x, 0o, 0b, 0r and plain decimals. On failure an error
    naming the command line option is reported.
    """
    ValueStr = setup.__dict__[MemberName]
    # IMPROVED: isinstance() instead of 'type(x) == int' -- the idiomatic
    # type check, which also accepts int subclasses.
    if isinstance(ValueStr, int):
        return ValueStr
    result = read_integer(StringIO(ValueStr))
    if result is None:
        option_name = repr(SETUP_INFO[MemberName][0])[1:-1]
        error_msg("Cannot convert '%s' into an integer for '%s'.\n" % (ValueStr, option_name) + \
                  "Use prefix '0x' for hexadecimal numbers.\n" + \
                  " '0o' for octal numbers.\n" + \
                  " '0b' for binary numbers.\n" + \
                  " '0r' for roman numbers.\n" + \
                  " and no prefix for decimal numbers.")
    return result
def __get_integer(MemberName):
    """Return the setup member 'MemberName' as an integer.

    If the stored value already is an int it is returned unchanged.
    Otherwise it is parsed with read_integer(), which understands the
    prefixes 0x, 0o, 0b, 0r and plain decimals. On failure an error
    naming the command line option is reported.
    """
    ValueStr = setup.__dict__[MemberName]
    # IMPROVED: isinstance() instead of 'type(x) == int' -- the idiomatic
    # type check, which also accepts int subclasses.
    if isinstance(ValueStr, int):
        return ValueStr
    result = read_integer(StringIO(ValueStr))
    if result is None:
        option_name = repr(SETUP_INFO[MemberName][0])[1:-1]
        error_msg("Cannot convert '%s' into an integer for '%s'.\n" % (ValueStr, option_name) + \
                  "Use prefix '0x' for hexadecimal numbers.\n" + \
                  " '0o' for octal numbers.\n" + \
                  " '0b' for binary numbers.\n" + \
                  " '0r' for roman numbers.\n" + \
                  " and no prefix for decimal numbers.")
    return result
def test(Input, Cmp=None, Base=10): print "%s%s --> " % (Input, " " * (10 - len(Input))), try: output = read_integer(StringIO(Input)) except: print return if Base == 10: print output, else: print "%X" % output, if Cmp is None: print else: print Cmp == output
def test(Input, Cmp=None, Base=10): print "%s%s --> " % (Input, " " * (10 - len(Input))), try: output = read_integer(StringIO(Input)) except: print return if Base == 10: print output, else: print "%X" % output, if Cmp is None: print else: print Cmp == output
def parse_token_id_definitions(fh, NamesOnlyF=False):
    """Parse the body of a 'token { ... }' section.

    NamesOnlyF == True: only token names are collected; nothing is
    written to the global 'token_id_db' and the sorted name list is
    returned instead.
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    prefix       = Setup.token_id_prefix
    prefix_plain = Setup.token_id_prefix_plain  # i.e. without name space included

    if NamesOnlyF:
        result = set()

    skip_whitespace(fh)
    if not check(fh, "{"):
        error.log("Missing opening '{' for after 'token' section identifier.", fh)

    while not check(fh, "}"):
        skip_whitespace(fh)

        candidate = read_identifier(fh, TolerantF=True,
                                    OnMissingStr="Missing valid token identifier.")

        # Warn when the name itself carries a token prefix; the prefix is
        # prepended automatically in the generated code.
        suspicious_prefix = None
        for p in (prefix, prefix_plain):
            if p and candidate.startswith(p):
                suspicious_prefix = p
                break

        if suspicious_prefix is not None:
            error.warning("Token identifier '%s' starts with token prefix '%s'.\n" \
                          % (candidate, suspicious_prefix) \
                          + "Token prefix is mounted automatically. This token id appears in the source\n" \
                          + "code as '%s%s'." \
                          % (prefix, candidate), \
                          fh,
                          SuppressCode=NotificationDB.warning_token_id_prefix_appears_in_token_id_name)

        skip_whitespace(fh)

        if NamesOnlyF:
            result.add(prefix + candidate)
            if not check(fh, ";"):
                error.log("Missing ';' after token identifier '%s'.\n" % candidate, fh)
            continue

        # Optional numeric assignment: 'NAME = 4711;'
        numeric_value = None
        if check(fh, "="):
            skip_whitespace(fh)
            numeric_value = read_integer(fh)
            if numeric_value is None:
                error.log("Missing number after '=' for token identifier '%s'." % candidate, fh)

        if not check(fh, ";"):
            error.log("Missing ';' after token identifier '%s'." % candidate, fh)

        blackboard.token_id_db[candidate] = \
            TokenInfo(candidate, numeric_value,
                      SourceReference=SourceRef.from_FileHandle(fh))

    if NamesOnlyF:
        return sorted(result)
    return  # Changes are applied to 'blackboard.token_id_db'
def parse_token_id_definitions(fh, NamesOnlyF=False):
    """Parse the body of a 'token { ... }' section.

    NamesOnlyF == True: only token names are collected; nothing is
    written to the global 'token_id_db' and the sorted name list is
    returned instead.
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    token_prefix = Setup.token_id_prefix
    plain_prefix = Setup.token_id_prefix_plain  # i.e. without name space included

    name_set = set() if NamesOnlyF else None

    skip_whitespace(fh)
    if check(fh, "{") == False:
        error.log("Missing opening '{' for after 'token' section identifier.", fh)

    while True:
        if check(fh, "}"):
            break
        skip_whitespace(fh)

        candidate = read_identifier(fh, TolerantF=True,
                                    OnMissingStr="Missing valid token identifier.")

        # A name that already carries the (automatically mounted) token
        # prefix is suspicious -- warn, but keep going.
        matched = None
        if token_prefix != "" and candidate.find(token_prefix) == 0:
            matched = token_prefix
        elif plain_prefix != "" and candidate.find(plain_prefix) == 0:
            matched = plain_prefix

        if matched is not None:
            error.warning("Token identifier '%s' starts with token prefix '%s'.\n" \
                          % (candidate, matched) \
                          + "Token prefix is mounted automatically. This token id appears in the source\n" \
                          + "code as '%s%s'." \
                          % (token_prefix, candidate), \
                          fh,
                          SuppressCode=NotificationDB.warning_token_id_prefix_appears_in_token_id_name)

        skip_whitespace(fh)

        if name_set is not None:
            name_set.add(token_prefix + candidate)
            if check(fh, ";") == False:
                error.log("Missing ';' after token identifier '%s'.\n" % candidate, fh)
            continue

        # An optional '= NUMBER' assigns an explicit numeric token id.
        numeric_value = None
        if check(fh, "="):
            skip_whitespace(fh)
            numeric_value = read_integer(fh)
            if numeric_value is None:
                error.log("Missing number after '=' for token identifier '%s'." % candidate, fh)

        if check(fh, ";") == False:
            error.log("Missing ';' after token identifier '%s'." % candidate, fh)

        ti = TokenInfo(candidate, numeric_value,
                       SourceReference=SourceRef.from_FileHandle(fh))
        blackboard.token_id_db[candidate] = ti

    if name_set is None:
        return  # Changes are applied to 'blackboard.token_id_db'
    return sorted(list(name_set))
def do(fh):
    """Parses pattern definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

       In other words the right hand side *must* be a character set.

       RETURNS: the sealed and consistency-checked IndentationSetup once
                the closing '>' is reached.
    """
    indentation_setup = IndentationSetup(fh)

    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    skip_whitespace(fh)

    while 1 + 1 == 2:
        skip_whitespace(fh)

        # '>' terminates the section.
        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, pattern = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.", fh)

        verify_word_in_list(identifier,
                            ["space", "grid", "bad", "newline", "suppressor"],
                            "Unrecognized indentation specifier '%s'." % identifier, fh)

        # 'space', 'bad' and 'grid' require a single-character pattern;
        # its trigger set is extracted from the state machine.
        trigger_set = None
        if identifier in ["space", "bad", "grid"]:
            if len(pattern.sm.states) != 2:
                error_msg("For indentation '%s' only patterns are addmissible which\n" % identifier + \
                          "can be matched by a single character, e.g. \" \" or [a-z].", fh)
            transition_map = pattern.sm.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            trigger_set = transition_map.values()[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_space(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_space(pattern_str, trigger_set, variable, fh)
                else:
                    # 'space' defaults to a width of 1 when no value is given.
                    indentation_setup.specify_space(pattern_str, trigger_set, 1, fh)
        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                # NOTE(review): unlike the 'space' branch, whitespace is skipped
                # here before reading the identifier -- confirm the asymmetry
                # is intentional.
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set, variable, fh)
                else:
                    # 'grid' has no default width -- a value is mandatory.
                    error_msg("Missing integer or variable name after keyword 'grid'.", fh)
        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)
        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, pattern.sm, fh)
        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, pattern.sm, fh)
        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg("Missing ';' after indentation '%s' specification." % identifier, fh)
def do(fh):
    """Parses pattern definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

       In other words the right hand side *must* be a character set.

       RETURNS: the sealed and consistency-checked IndentationSetup once
                the closing '>' is reached.
    """
    indentation_setup = IndentationSetup(fh)

    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    skip_whitespace(fh)

    while 1 + 1 == 2:
        skip_whitespace(fh)

        # '>' terminates the section.
        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, pattern = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.", fh)

        verify_word_in_list(identifier,
                            ["space", "grid", "bad", "newline", "suppressor"],
                            "Unrecognized indentation specifier '%s'." % identifier, fh)

        # 'space', 'bad' and 'grid' require a single-character pattern;
        # its trigger set is extracted from the state machine.
        trigger_set = None
        if identifier in ["space", "bad", "grid"]:
            if len(pattern.sm.states) != 2:
                error_msg("For indentation '%s' only patterns are addmissible which\n" % identifier + \
                          "can be matched by a single character, e.g. \" \" or [a-z].", fh)
            transition_map = pattern.sm.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            trigger_set = transition_map.values()[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_space(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_space(pattern_str, trigger_set, variable, fh)
                else:
                    # 'space' defaults to a width of 1 when no value is given.
                    indentation_setup.specify_space(pattern_str, trigger_set, 1, fh)
        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                # NOTE(review): unlike the 'space' branch, whitespace is skipped
                # here before reading the identifier -- confirm the asymmetry
                # is intentional.
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set, variable, fh)
                else:
                    # 'grid' has no default width -- a value is mandatory.
                    error_msg("Missing integer or variable name after keyword 'grid'.", fh)
        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)
        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, pattern.sm, fh)
        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, pattern.sm, fh)
        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg("Missing ';' after indentation '%s' specification." % identifier, fh)
def read_character_code(fh):
    """Read one character-code specification from the stream.

    Accepted forms (tried in this order):
       'x'      -- a (possibly backslashed) utf8 character in single quotes,
       UC NAME  -- a unicode character name,
                   e.g. UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE,
       integer  -- a plain number, whatever read_integer() accepts.

    RETURNS: the character code as integer, or -1 if none of the forms
             matched; in that case the stream position is restored.
    """
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()          # remember position so mismatches can be undone
    start = fh.read(1)

    if start == "":
        # End of stream -- nothing to read.
        fh.seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code is None:
            error_msg("Missing utf8-character for definition of character code by character.", fh)
        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        # Only 'UC' introduces a unicode name; anything else is not ours.
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            # A string result signals failure; report with suggestions.
            verify_word_in_list(ucs_name, ucs_property_db["Name"].code_point_db,
                                "The string %s\ndoes not identify a known unicode character." % ucs_name,
                                fh)
        elif type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh)
        return character_code

    # Not a quote, not 'UC': retry from the start as a plain integer.
    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None:
        return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1
def parse_token_id_definitions(fh, NamesOnlyF=False):
    """Parse the body of a 'token { ... }' section.

    fh         -- input file handle, positioned after the section keyword.
    NamesOnlyF -- if True, only collect the (prefixed) token names and
                  return them sorted; the global 'blackboard.token_id_db'
                  is left untouched.

    RETURNS: sorted list of token names if NamesOnlyF is True,
             otherwise None (entries are stored in 'blackboard.token_id_db').
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    token_prefix       = Setup.token_id_prefix
    token_prefix_plain = Setup.token_id_prefix_plain  # i.e. without name space included

    if NamesOnlyF: db = {}
    else:          db = blackboard.token_id_db

    skip_whitespace(fh)
    if not check(fh, "{"):
        error_msg("missing opening '{' for after 'token' section identifier.\n", fh)

    while check(fh, "}") == False:
        skip_whitespace(fh)

        candidate = read_identifier(fh, TolerantF=True)

        if candidate == "":
            # BUG FIX: the original did '"Missing valid token identifier." % candidate'
            # -- a '%' on a format string without placeholder raises TypeError
            # instead of reporting the parse error.
            error_msg("Missing valid token identifier.", fh)

        # -- check the name; if it starts with the token prefix paste a warning.
        #    Guard against empty prefixes: ''.find('') == 0 is always true.
        suspicious_prefix = None
        if len(token_prefix) != 0 and candidate.find(token_prefix) == 0:
            suspicious_prefix = token_prefix
        elif len(token_prefix_plain) != 0 and candidate.find(token_prefix_plain) == 0:
            # BUG FIX: this branch previously reported 'token_prefix' in the
            # message even though 'token_prefix_plain' is what matched.
            suspicious_prefix = token_prefix_plain

        if suspicious_prefix is not None:
            error_msg("Token identifier '%s' starts with token prefix '%s'.\n" % (candidate, suspicious_prefix) + \
                      "Token prefix is mounted automatically. This token id appears in the source\n" + \
                      "code as '%s%s'." % (token_prefix, candidate), \
                      fh, DontExitF=True)

        skip_whitespace(fh)

        if NamesOnlyF:
            db[token_prefix + candidate] = True
            if check(fh, ";") == False:
                error_msg("Missing ';' after definition of token identifier '%s'.\n" % candidate + \
                          "This is mandatory since Quex version 0.50.1.", fh)
            continue

        # Parse a possible numeric value after '='
        numeric_value = None
        if check(fh, "="):
            skip_whitespace(fh)
            numeric_value = read_integer(fh)
            if numeric_value is None:
                error_msg("Missing number after '=' for token identifier '%s'." % candidate, fh)

        if check(fh, ";") == False:
            error_msg("Missing ';' after definition of token identifier '%s'.\n" % candidate + \
                      "This is mandatory since Quex version 0.50.1.", fh)

        db[candidate] = TokenInfo(candidate, numeric_value,
                                  Filename=fh.name,
                                  LineN=get_current_line_info_number(fh))

    if NamesOnlyF:
        result = db.keys()
        result.sort()
        return result