def get_character_code_sequence(sh):
    """Collect the character codes of a quoted string up to its closing '"'.

    Codes are fetched one at a time with
    utf8.__read_one_utf8_code_from_stream(). When a backslash is read, it is
    not stored; the code delivered by snap_backslashed_character.do() is
    stored in its place.

    sh -- stream to read from; asserted to be a 'StringIO' or 'file' object.

    RETURNS: list of character codes. The closing '"' is consumed from the
             stream but is not part of the list.

    RAISES:  RegularExpressionException if the stream ends before a closing
             '"' was found, or if snap_backslashed_character.do() delivers
             None.
    """
    stream_type_name = sh.__class__.__name__
    assert stream_type_name == "StringIO" or stream_type_name == "file"

    backslash = ord("\\")
    closing_quote = ord('"')

    collected = []
    while True:
        code = utf8.__read_one_utf8_code_from_stream(sh)

        if code is None:
            raise RegularExpressionException(
                "End of file reached while parsing quoted string.")

        if code == closing_quote:
            return collected

        if code == backslash:
            # The code resulting from the escape is stored without being
            # compared against '"', so it never terminates the string.
            code = snap_backslashed_character.do(sh)
            if code is None:
                raise RegularExpressionException(
                    "Unidentified backslash-sequence in quoted string.")

        collected.append(code)
def read_character_code(fh):
    """Read a character code that is written in one of three notations.

    The notations are tried in this order:

      (1) a character between single quotes, e.g. '+'; the character may be
          given as a backslash sequence,
      (2) 'UC' followed by a unicode character name,
          e.g. UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE,
      (3) whatever read_integer() accepts.

    RETURNS: the value found for the notation that applied. If none applies
             (or the stream is at its end), the stream is set back to the
             position it had on entry and -1 is returned.

    NOTE: This function is tested with the regression test for feature
          request 2251359. See directory $QUEX_PATH/TEST/2251359.
    """
    start_position = fh.tell()
    first = fh.read(1)

    if first == "":
        fh.seek(start_position)
        return -1

    if first == "'":
        # (1) Read an utf-8 character and take its code. Example: '+'
        if check(fh, "\\"):
            # Original author's note: snap_backslashed_character throws an
            # exception if the 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(
                fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code is None:
            error.log("Missing utf8-character for definition of character code by character.",
                      fh)
        elif fh.read(1) != '\'':
            error.log("Missing closing ' for definition of character code by character.",
                      fh)
        return character_code

    if first == "U":
        # (2) Unicode name. Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        if fh.read(1) != "C":
            fh.seek(start_position)
            return -1

        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(start_position)
            return -1

        # Get the character set related to the given name. Note, the size of
        # the set is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        found_type = type(character_code)
        if found_type in [str, unicode]:
            error.verify_word_in_list(ucs_name,
                                      ucs_property_db["Name"].code_point_db,
                                      "The string %s\ndoes not identify a known unicode character." % ucs_name,
                                      fh)
        elif found_type not in [int, long]:
            error.log("%s relates to more than one character in unicode database." % ucs_name,
                      fh)
        return character_code

    # (3) Plain number.
    fh.seek(start_position)
    character_code = read_integer(fh)
    if character_code is None:
        # Try to interpret it as something else ...
        fh.seek(start_position)
        return -1
    return character_code
def test(Title, TestString):
    """Print the character codes of the backslash sequences in TestString.

    The string is read letter by letter. Every letter is expected to be a
    backslash that starts a sequence which snap.do() consumes from the
    stream; the resulting code is recorded as a four digit upper case hex
    string. The first letter that is not a backslash prints
    "[end of sequence]" and stops the scan.

    Title      -- headline printed in front of the report.
    TestString -- text to be scanned.
    """
    print(Title + ":")
    print(" characters = \"" + TestString + "\"")

    stream = StringIO(TestString)
    hex_codes = []
    # iter() with a sentinel stops as soon as read(1) delivers "".
    for letter in iter(lambda: stream.read(1), ""):
        if letter != "\\":
            print("[end of sequence]")
            break
        hex_codes.append("%04X" % snap.do(stream))

    print(" character codes = " + repr(hex_codes))
def get_character_code_sequence(sh):
    """Collect the character codes of a quoted string up to its closing '"'.

    Codes are fetched one at a time with
    utf8.__read_one_utf8_code_from_stream(). When a backslash is read, it is
    not stored; the code delivered by snap_backslashed_character.do() is
    stored in its place.

    sh -- stream to read from; asserted to be a 'StringIO' or 'file' object.

    RETURNS: list of character codes. The closing '"' is consumed from the
             stream but is not part of the list.

    RAISES:  RegularExpressionException if the stream ends before a closing
             '"' was found, or if snap_backslashed_character.do() delivers
             None.
    """
    stream_type_name = sh.__class__.__name__
    assert stream_type_name == "StringIO" or stream_type_name == "file"

    backslash = ord("\\")
    closing_quote = ord('"')

    collected = []
    while True:
        code = utf8.__read_one_utf8_code_from_stream(sh)

        if code is None:
            raise RegularExpressionException("End of file reached while parsing quoted string.")

        if code == closing_quote:
            return collected

        if code == backslash:
            # The code resulting from the escape is stored without being
            # compared against '"', so it never terminates the string.
            code = snap_backslashed_character.do(sh)
            if code is None:
                raise RegularExpressionException("Unidentified backslash-sequence in quoted string.")

        collected.append(code)
def read_character_code(fh):
    """Read a character code that is written in one of three notations.

    The notations are tried in this order:

      (1) a character between single quotes, e.g. '+'; the character may be
          given as a backslash sequence,
      (2) 'UC' followed by a unicode character name,
          e.g. UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE,
      (3) whatever read_integer() accepts.

    RETURNS: the value found for the notation that applied. If none applies
             (or the stream is at its end), the stream is set back to the
             position it had on entry and -1 is returned.

    NOTE: This function is tested with the regression test for feature
          request 2251359. See directory $QUEX_PATH/TEST/2251359.
    """
    start_position = fh.tell()
    first = fh.read(1)

    if first == "":
        fh.seek(start_position)
        return -1

    if first == "'":
        # (1) Read an utf-8 character and take its code. Example: '+'
        if check(fh, "\\"):
            # Original author's note: snap_backslashed_character throws an
            # exception if the 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(
                fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code is None:
            error_msg("Missing utf8-character for definition of character code by character.",
                      fh)
        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.",
                      fh)
        return character_code

    if first == "U":
        # (2) Unicode name. Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        if fh.read(1) != "C":
            fh.seek(start_position)
            return -1

        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(start_position)
            return -1

        # Get the character set related to the given name. Note, the size of
        # the set is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        found_type = type(character_code)
        if found_type in [str, unicode]:
            verify_word_in_list(ucs_name,
                                ucs_property_db["Name"].code_point_db,
                                "The string %s\ndoes not identify a known unicode character." % ucs_name,
                                fh)
        elif found_type not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name,
                      fh)
        return character_code

    # (3) Plain number.
    fh.seek(start_position)
    character_code = read_integer(fh)
    if character_code is None:
        # Try to interpret it as something else ...
        fh.seek(start_position)
        return -1
    return character_code
def do(sh):
    """Parse the inside of a traditional character set such as [a-z0-9A-Z].

    Reading starts at the current stream position; this function does not
    consume an opening '[' itself. A leading '^' marks the set as inverse.
    Single characters are reported to the tracker via consider_letter(),
    ranges 'first-last' via consider_interval(first, last + 1). Characters
    may be given as backslash sequences, which are resolved by
    snap_backslashed_character.do(). Parsing stops at the closing ']'.

    sh -- stream to read from; asserted to be a 'StringIO' or 'file' object.

    RETURNS: tracker.match_set (described by the original author as a
             NumberSet of code points). If the set was inverted with '^',
             its complement with respect to
             Setup.buffer_encoding.source_set is returned instead.

    RAISES:  RegularExpressionException if
             -- a '-' appears where a character is expected,
             -- the closing ']' is missing,
             -- a range has no end character, or ends in an unescaped '-',
             -- a range starts and ends with the same character.
    """
    assert sh.__class__.__name__ == "StringIO" \
           or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        """Consume the next character only if it equals 'letter'."""
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    char_code = None
    # Checks for " appearing twice. Some users did use constructs such as
    # "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code is None:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range: 'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [None, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # FIX: the character *behind* the range operator has to be
                # inspected here. Formerly the range start ('char_code') was
                # tested, so 'a--' slipped through, while a range starting
                # with an escaped '-' was refused with this message.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException(
                    "Character range '%s-%s' has only one element.\n"
                    % (utf8_string, utf8_string)
                    + "In this case avoid range expression for clarity.")

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

        # 'char_code' can only be None at this point if it came from
        # snap_backslashed_character.do(); the 'None' read from the stream
        # has already raised above.
        if char_code is None:
            break

    if tracker.negation_f:
        return tracker.match_set.get_complement(
            Setup.buffer_encoding.source_set)
    else:
        return tracker.match_set
def snap_primary(stream, PatternDict):
    """Snap one 'primary' of a regular expression from the stream.

    primary:  " non_double_quote * "        = character string
              [ non_rect_bracket_close ]    = set of characters
              { identifier }                = pattern replacement
              ( expression )
              . and backslash sequences (\\C, \\R, \\A, property sets,
                backslashed characters)
              non_control_character+        = lonely characters
              primary repetition_cmd

    The first character selects the sub-parser. A whitespace or a character
    that cannot start a primary is put back onto the stream.

    RETURNS: the value of __debug_exit(); it receives None if no primary
             was found, else the parsed result after an optional repetition
             has been applied and beautifier.do() has been run on it.

    RAISES:  RegularExpressionException for a repetition operator without
             an expression in front of it, and for a backslash that is
             followed by an unrecognized character code.
    """
    __debug_entry("primary", stream)

    def _no_primary():
        # Put the character back and report that nothing was recognized.
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    head = stream.read(1)
    peeked = stream.read(1)
    if head != "" and peeked != "":
        # Undo the peek, so that the stream stands right behind 'head'.
        stream.seek(-1, 1)

    if head == "":
        return __debug_exit(None, stream)

    # -- 'primary' primary
    if head == "\"":
        machine = snap_character_string.do(stream)

    elif head == "[":
        stream.seek(-1, 1)
        machine = character_set_expression.do(stream, PatternDict)

    elif head == "{":
        machine = snap_replacement(stream, PatternDict)

    elif head == ".":
        machine = create_ALL_BUT_NEWLINE_state_machine()

    elif head == "(":
        machine = snap_bracketed_expression(stream, PatternDict)

    elif head.isspace():
        # a lonestanding space ends the regular expression
        return _no_primary()

    elif head in ["*", "+", "?"]:
        raise RegularExpressionException(
            "lonely operator '%s' without expression proceeding." % head)

    elif head == "\\":
        if peeked == "C":
            stream.read(1)
            machine = snap_case_folded_pattern(stream, PatternDict)

        elif peeked == "R":
            machine = get_expression_in_brackets(
                stream, PatternDict, "reverse operator", "R").get_inverse()

        elif peeked == "A":
            machine = get_expression_in_brackets(
                stream, PatternDict, "anti-pattern operator", "A")
            machine.transform_to_anti_pattern()

        else:
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set is None:
                # Original note: snap_property_set() leaves the stream right
                # before the '\\'; step over it.
                stream.seek(1, 1)
                trigger_set = snap_backslashed_character.do(stream)
                if trigger_set is None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
            machine = StateMachine()
            machine.add_transition(machine.init_state_index, trigger_set,
                                   AcceptanceF=True)

    elif head not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        machine = snap_non_control_character(stream, PatternDict)

    else:
        # Not a valid primary.
        # NOTE (original): This includes the '$' sign which means
        #       'end of line', because '$' is in CONTROL_CHARACTERS but is
        #       not checked against above. Leaving here on '$' is intended;
        #       the '$' sign is handled on the very top level.
        return _no_primary()

    # -- optional repetition command?
    repeated = __snap_repetition_range(machine, stream)
    if repeated is not None:
        machine = repeated

    return __debug_exit(beautifier.do(machine), stream)
def snap_primary(stream, PatternDict):
    """Snap one 'primary' of a regular expression from the stream.

    primary:  " non_double_quote * "        = character string
              [ non_rect_bracket_close ]    = set of characters
              { identifier }                = pattern replacement
              ( expression )
              . and backslash sequences (\\C, \\R, \\A, property sets,
                backslashed characters)
              non_control_character+        = lonely characters
              primary repetition_cmd

    The first character selects the sub-parser. A whitespace or a character
    that cannot start a primary is put back onto the stream.

    RETURNS: the value of __debug_exit(); it receives None if no primary
             was found, else the parsed result after an optional repetition
             has been applied and beautifier.do() has been run on it.

    RAISES:  RegularExpressionException for a repetition operator without
             an expression in front of it, and for a backslash that is
             followed by an unrecognized character code.
    """
    __debug_entry("primary", stream)

    def _no_primary():
        # Put the character back and report that nothing was recognized.
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    head = stream.read(1)
    peeked = stream.read(1)
    if head != "" and peeked != "":
        # Undo the peek, so that the stream stands right behind 'head'.
        stream.seek(-1, 1)

    if head == "":
        return __debug_exit(None, stream)

    # -- 'primary' primary
    if head == "\"":
        machine = snap_character_string.do(stream)

    elif head == "[":
        stream.seek(-1, 1)
        machine = character_set_expression.do(stream, PatternDict)

    elif head == "{":
        machine = snap_replacement(stream, PatternDict)

    elif head == ".":
        machine = create_ALL_BUT_NEWLINE_state_machine()

    elif head == "(":
        machine = snap_bracketed_expression(stream, PatternDict)

    elif head.isspace():
        # a lonestanding space ends the regular expression
        return _no_primary()

    elif head in ["*", "+", "?"]:
        raise RegularExpressionException("lonely operator '%s' without expression proceeding." % head)

    elif head == "\\":
        if peeked == "C":
            stream.read(1)
            machine = snap_case_folded_pattern(stream, PatternDict)

        elif peeked == "R":
            machine = get_expression_in_brackets(stream, PatternDict,
                                                 "reverse operator", "R").get_inverse()

        elif peeked == "A":
            machine = get_expression_in_brackets(stream, PatternDict,
                                                 "anti-pattern operator", "A")
            machine.transform_to_anti_pattern()

        else:
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set is None:
                # Original note: snap_property_set() leaves the stream right
                # before the '\\'; step over it.
                stream.seek(1, 1)
                trigger_set = snap_backslashed_character.do(stream)
                if trigger_set is None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
            machine = StateMachine()
            machine.add_transition(machine.init_state_index, trigger_set,
                                   AcceptanceF=True)

    elif head not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        machine = snap_non_control_character(stream, PatternDict)

    else:
        # Not a valid primary.
        # NOTE (original): This includes the '$' sign which means
        #       'end of line', because '$' is in CONTROL_CHARACTERS but is
        #       not checked against above. Leaving here on '$' is intended;
        #       the '$' sign is handled on the very top level.
        return _no_primary()

    # -- optional repetition command?
    repeated = __snap_repetition_range(machine, stream)
    if repeated is not None:
        machine = repeated

    return __debug_exit(beautifier.do(machine), stream)
def snap_primary(stream, PatternDict):
    """Snap one 'primary' of a regular expression from the stream.

    primary:  " non_double_quote * "        = character string
              [ non_rect_bracket_close ]    = set of characters
              { identifier }                = pattern replacement
              ( expression )
              . and backslash sequences (commands, property sets,
                backslashed characters)
              non_control_character+        = lonely characters
              primary repetition_cmd

    The first character selects the sub-parser. A whitespace, the
    SPECIAL_TERMINATOR, or a character that cannot start a primary is put
    back onto the stream.

    RETURNS: the value of __debug_exit(); it receives None if no primary
             was found, else the parsed result after an optional repetition
             has been applied and beautifier.do() has been run on it.

    RAISES:  RegularExpressionException for a repetition operator without
             an expression in front of it, and for a backslash that is
             followed by an unrecognized character code.
    """
    __debug_entry("primary", stream)

    def _no_primary():
        # Put the character back and report that nothing was recognized.
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    head = stream.read(1)
    if head == "":
        return __debug_exit(None, stream)

    # -- 'primary' primary
    if head == "\"":
        machine = snap_character_string.do(stream)

    elif head == "[":
        stream.seek(-1, 1)
        machine = snap_character_set_expression(stream, PatternDict)

    elif head == "{":
        machine = snap_replacement(stream, PatternDict)

    elif head == ".":
        machine = create_ALL_BUT_NEWLINE_state_machine(stream)

    elif head == "(":
        machine = snap_bracketed_expression(stream, PatternDict)

    elif head.isspace():
        # a lonestanding space ends the regular expression
        return _no_primary()

    elif head in ["*", "+", "?"]:
        raise RegularExpressionException("lonely operator '%s' without expression proceeding." % head)

    elif head == "\\":
        machine = snap_command(stream, PatternDict)
        if machine is None:
            # No command: retry from the backslash as a property set, and
            # if that fails too, as a backslashed character.
            stream.seek(-1, 1)
            trigger_set = snap_property_set(stream)
            if trigger_set is None:
                # snap the '\'
                stream.read(1)
                trigger_set = snap_backslashed_character.do(stream)
                if trigger_set is None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
            machine = StateMachine()
            machine.add_transition(machine.init_state_index, trigger_set,
                                   AcceptanceF=True)

    elif head not in CONTROL_CHARACTERS and head != SPECIAL_TERMINATOR:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        machine = snap_non_control_character(stream, PatternDict)

    else:
        # Not a valid primary.
        # NOTE (original): This includes the '$' sign which means
        #       'end of line', because '$' is in CONTROL_CHARACTERS but is
        #       not checked against above. Leaving here on '$' is intended;
        #       the '$' sign is handled on the very top level.
        return _no_primary()

    # -- optional repetition command?
    repeated = __snap_repetition_range(machine, stream)
    if repeated is not None:
        machine = repeated

    # Original note: there's something going wrong with pseudo-ambiguous
    # post context if we do not clean-up here. TODO: Investigate why?
    # See tests in generator/TEST directory.
    return __debug_exit(beautifier.do(machine), stream)
def do(sh):
    """Parse the inside of a traditional character set such as [a-z0-9A-Z].

    Reading starts at the current stream position; this function does not
    consume an opening '[' itself. A leading '^' marks the set as inverse.
    Single characters are reported to the tracker via consider_letter(),
    ranges 'first-last' via consider_interval(first, last + 1). Characters
    may be given as backslash sequences, which are resolved by
    snap_backslashed_character.do(). Parsing stops at the closing ']'.

    sh -- stream to read from; asserted to be a 'StringIO' or 'file' object.

    RETURNS: tracker.match_set (described by the original author as a
             NumberSet of code points). If the set was inverted with '^',
             its complement with respect to Setup.buffer_codec.source_set
             is returned instead.

    RAISES:  RegularExpressionException if
             -- a '-' appears where a character is expected,
             -- the closing ']' is missing,
             -- a range has no end character, or ends in an unescaped '-',
             -- a range starts and ends with the same character.
    """
    assert sh.__class__.__name__ == "StringIO" \
           or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        """Consume the next character only if it equals 'letter'."""
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    char_code = None
    # Checks for " appearing twice. Some users did use constructs such as
    # "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code is None:
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range: 'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [None, ord(']')]:
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # FIX: the character *behind* the range operator has to be
                # inspected here. Formerly the range start ('char_code') was
                # tested, so 'a--' slipped through, while a range starting
                # with an escaped '-' was refused with this message.
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n"
                                                 % (utf8_string, utf8_string)
                                                 + "In this case avoid range expression for clarity.")

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

        # 'char_code' can only be None at this point if it came from
        # snap_backslashed_character.do(); the 'None' read from the stream
        # has already raised above.
        if char_code is None:
            break

    if tracker.negation_f:
        return tracker.match_set.get_complement(Setup.buffer_codec.source_set)
    else:
        return tracker.match_set