def read_character_code(fh):
    """Read a character code from the stream 'fh'.

    The following notations are tried, in this order:

      (1) a single character in single quotes, e.g. '+'; the character
          may be given as a backslashed character,
      (2) 'UC' followed by a unicode character name, e.g.
          UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE,
      (3) whatever 'read_integer()' accepts.

    RETURNS: The character code that was read.
             -1, if none of the notations matched. In that case the stream
             position is reset to where it was when the function was
             entered, so that the caller can try to interpret the input
             as something else.

    Malformed input inside a notation that has already been recognized is
    reported through 'error_msg()' / 'verify_word_in_list()'.
    """
    # NOTE: This function is tested with the regression test for feature
    #       request 2251359. See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        # End of stream. (The original called a bare, undefined 'seek()'.)
        fh.seek(pos)
        return -1

    elif start == "'":
        # Read an utf-8 character and get its code.
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if the
            # 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(
                fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        # 0xFF is treated as 'no character could be read'.
        if character_code == 0xFF:
            error_msg("Missing utf8-character for definition of character code by character.", fh)
        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # Read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size
        # of the set is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            # A string result means that the lookup failed.
            verify_word_in_list(ucs_name, ucs_property_db["Name"].code_point_db,
                                "The string %s\ndoes not identify a known unicode character." % ucs_name,
                                fh)
        elif type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh)
        return character_code

    # Neither quoted character nor unicode name: retry from the start as
    # an integer.
    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None:
        return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1
def do_shortcut(stream, ShortcutLetter, PropertyAlias):
    r"""Property shortcut '\ShortcutLetter{...}', i.e. the short form of
    '\P{PropertyAlias=...}'.

    Example: with ShortcutLetter 'N' and the 'Name' property an expression
    of the form '\N{CHARACTER NAME}' is parsed and the set of characters
    matching the given name is returned. Wildcards are allowed.

    RETURNS: What 'ucs_property_db.get_character_set()' delivers for the
             given property alias and the parsed value.

    RAISES:  RegularExpressionException, if the database answers with a
             plain string; that string becomes the exception's message.
    """
    # The parser throws an exception unless exactly one element is found,
    # so index 0 is always present here.
    parsed = __parse_property_expression(stream, ShortcutLetter,
                                         EqualConditionPossibleF=False)

    character_set = ucs_property_db.get_character_set(PropertyAlias, parsed[0])

    if type(character_set) == str:
        raise RegularExpressionException(character_set)

    return character_set
def do(stream):
    r"""Property expression: '\P{...}'

    Parses one of the two forms

        '\P{property = value}'    or    '\P{binary_property}'

    and returns the related character set.

    RETURNS: What 'ucs_property_db.get_character_set()' delivers for the
             parsed property name and value (value is None for the
             single-element form).

    RAISES:  RegularExpressionException, if the database answers with a
             plain string; that string becomes the exception's message.
    """
    # The parser throws an exception if it finds less than one or more
    # than two elements.
    parsed = __parse_property_expression(stream, "P")

    name = parsed[0]
    value = parsed[1] if len(parsed) != 1 else None

    character_set = ucs_property_db.get_character_set(name, value)

    if type(character_set) == str:
        raise RegularExpressionException(character_set)

    return character_set
def read_character_code(fh):
    """Read a character code from the stream 'fh'.

    The following notations are tried, in this order:

      (1) a single character in single quotes, e.g. '+',
      (2) 'UC' followed by a unicode character name, e.g.
          UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE,
      (3) a number with base prefix: '0x' (hexadecimal), '0o' (octal),
          or '0b' (binary),
      (4) a plain decimal integer.

    RETURNS: The character code that was read.
             -1, if none of the notations matched. In that case the stream
             position is reset to where it was when the function was
             entered, so that the caller can try to interpret the input
             as something else.

    Malformed input inside a notation that has already been recognized is
    reported through 'error_msg()'.
    """
    # NOTE: This function is tested with the regression test for feature
    #       request 2251359. See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        # End of stream. (The original called a bare, undefined 'seek()'.)
        fh.seek(pos)
        return -1

    elif start == "'":
        # Read an utf-8 character and get its code.
        # Example: '+'
        character_code = __read_one_utf8_code_from_stream(fh)
        # 0xFF is treated as 'no character could be read'.
        if character_code == 0xFF:
            error_msg("Missing utf8-character for definition of character code by character.", fh)
        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)
        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # Read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = read_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size
        # of the set is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            # A string result means that the lookup failed.
            error_msg("%s does not identify a known unicode character." % ucs_name, fh)
        elif type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh)
        return character_code

    second = fh.read(1)
    if start == "0" and second.isdigit() == False:
        # '0' followed by a non-digit: the second letter names the base.
        base = second
        if base not in ["x", "o", "b"]:
            error_msg(("Number base '0%s' is unknown, please use '0x' for hexidecimal,\n" % base)
                      + "'0o' for octal, or '0b' for binary.", fh)
        number_txt = read_integer(fh)
        if number_txt == "":
            error_msg("Missing integer number after '0%s'" % base, fh)
        try:
            if base == "x":
                character_code = int("0x" + number_txt, 16)
            elif base == "o":
                character_code = int(number_txt, 8)
            elif base == "b":
                character_code = 0
                for letter in number_txt:
                    character_code = character_code << 1
                    if letter == "1":
                        character_code += 1
                    elif letter != "0":
                        error_msg("Letter '%s' not permitted in binary number (something start with '0b')" % letter, fh)
            else:
                # A normal integer number (starting with '0' though).
                # Only reachable if 'error_msg()' above returned.
                character_code = int(base + number_txt)
        except ValueError:
            # Only conversion failures are handled here; anything raised
            # by 'error_msg()' inside the block must pass through.
            error_msg("The string '%s' is not appropriate for number base '0%s'." % (number_txt, base), fh)

        return character_code

    elif start.isdigit():
        # Undo 'start' and 'second'. An absolute seek is used because
        # 'second' may be empty at the end of the stream, in which case
        # stepping back two characters would go too far.
        fh.seek(pos)
        # All that remains is that it is a 'normal' integer.
        number_txt = read_integer(fh)

        if number_txt == "":
            fh.seek(pos)
            return -1

        try:
            return int(number_txt)
        except ValueError:
            error_msg("The string '%s' is not appropriate for number base '10'." % number_txt, fh)

    else:
        # Try to interpret it as something else ...
        fh.seek(pos)
        return -1
def read_character_code(fh):
    """Read a character code from the stream 'fh'.

    The following notations are tried, in this order:

      (1) a single character in single quotes, e.g. '+',
      (2) 'UC' followed by a unicode character name, e.g.
          UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE,
      (3) a number with base prefix: '0x' (hexadecimal), '0o' (octal),
          or '0b' (binary),
      (4) a plain decimal integer.

    RETURNS: The character code that was read.
             -1, if none of the notations matched. In that case the stream
             position is reset to where it was when the function was
             entered, so that the caller can try to interpret the input
             as something else.

    Malformed input inside a notation that has already been recognized is
    reported through 'error_msg()'.
    """
    # NOTE: This function is tested with the regression test for feature
    #       request 2251359. See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        # End of stream. (The original called a bare, undefined 'seek()'.)
        fh.seek(pos)
        return -1

    elif start == "'":
        # Read an utf-8 character and get its code.
        # Example: '+'
        character_code = __read_one_utf8_code_from_stream(fh)
        # 0xFF is treated as 'no character could be read'.
        if character_code == 0xFF:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)
        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)
        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # Read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = read_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size
        # of the set is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            # A string result means that the lookup failed.
            error_msg(
                "%s does not identify a known unicode character." % ucs_name,
                fh)
        elif type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database."
                % ucs_name, fh)
        return character_code

    second = fh.read(1)
    if start == "0" and second.isdigit() == False:
        # '0' followed by a non-digit: the second letter names the base.
        base = second
        if base not in ["x", "o", "b"]:
            error_msg(("Number base '0%s' is unknown, please use '0x' for hexidecimal,\n" % base)
                      + "'0o' for octal, or '0b' for binary.", fh)
        number_txt = read_integer(fh)
        if number_txt == "":
            error_msg("Missing integer number after '0%s'" % base, fh)
        try:
            if base == "x":
                character_code = int("0x" + number_txt, 16)
            elif base == "o":
                character_code = int(number_txt, 8)
            elif base == "b":
                character_code = 0
                for letter in number_txt:
                    character_code = character_code << 1
                    if letter == "1":
                        character_code += 1
                    elif letter != "0":
                        error_msg(
                            "Letter '%s' not permitted in binary number (something start with '0b')"
                            % letter, fh)
            else:
                # A normal integer number (starting with '0' though).
                # Only reachable if 'error_msg()' above returned.
                character_code = int(base + number_txt)
        except ValueError:
            # Only conversion failures are handled here; anything raised
            # by 'error_msg()' inside the block must pass through.
            error_msg(
                "The string '%s' is not appropriate for number base '0%s'."
                % (number_txt, base), fh)

        return character_code

    elif start.isdigit():
        # Undo 'start' and 'second'. An absolute seek is used because
        # 'second' may be empty at the end of the stream, in which case
        # stepping back two characters would go too far.
        fh.seek(pos)
        # All that remains is that it is a 'normal' integer.
        number_txt = read_integer(fh)

        if number_txt == "":
            fh.seek(pos)
            return -1

        try:
            return int(number_txt)
        except ValueError:
            error_msg(
                "The string '%s' is not appropriate for number base '10'."
                % number_txt, fh)

    else:
        # Try to interpret it as something else ...
        fh.seek(pos)
        return -1
def read_character_code(fh):
    """Read a character code from the stream 'fh'.

    The following notations are tried, in this order:

      (1) a single character in single quotes, e.g. '+'; the character
          may be given as a backslashed character,
      (2) 'UC' followed by a unicode character name, e.g.
          UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE,
      (3) whatever 'read_integer()' accepts.

    RETURNS: The character code that was read.
             -1, if none of the notations matched. In that case the stream
             position is reset to where it was when the function was
             entered, so that the caller can try to interpret the input
             as something else.

    Malformed input inside a notation that has already been recognized is
    reported through 'error_msg()' / 'verify_word_in_list()'.
    """
    # NOTE: This function is tested with the regression test for feature
    #       request 2251359. See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        # End of stream. (The original called a bare, undefined 'seek()'.)
        fh.seek(pos)
        return -1

    elif start == "'":
        # Read an utf-8 character and get its code.
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if the
            # 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(
                fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        # 0xFF is treated as 'no character could be read'.
        if character_code == 0xFF:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)
        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # Read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size
        # of the set is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            # A string result means that the lookup failed.
            verify_word_in_list(
                ucs_name, ucs_property_db["Name"].code_point_db,
                "The string %s\ndoes not identify a known unicode character."
                % ucs_name, fh)
        elif type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database."
                % ucs_name, fh)
        return character_code

    # Neither quoted character nor unicode name: retry from the start as
    # an integer.
    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None:
        return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1