def consider_interval(self, Begin, End):
    """Add the code point interval [Begin, End) to 'self.match_set'.

    Begin -- first code point of the range.
    End   -- first code point BEHIND the range (exclusive end, as it is
             handed unchanged to 'Interval').

    Raises RegularExpressionException if the range contains no element,
    i.e. the last character does not have a higher code than the first.
    """
    # 'End' is exclusive: the last character of the range is 'End - 1'.
    # Testing 'Begin > End' alone would let a range that is reversed by
    # exactly one (e.g. 'b-a' => Begin == End) pass as an empty interval.
    if Begin >= End:
        last_code = End - 1
        raise RegularExpressionException("Character range: '-' requires character with 'lower code' to preceed\n" + \
                                         "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                                         (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(last_code),
                                          Begin, last_code))

    self.match_set.add_interval(Interval(Begin, End))
def __utf8_char(self, Code):
    """Return a printable string for the code point 'Code'.

    The interval borders -sys.maxint / sys.maxint are shown as "-oo" / "oo",
    space and the common control characters get a quoted escape notation,
    any other code below space is shown as backslash plus its number, and
    everything else is the quoted result of 'utf8.map_unicode_to_utf8'.
    """
    # Codes with a fixed textual representation; checked in this order.
    fixed_representations = (
        (- sys.maxint, "-oo"),
        (sys.maxint,   "oo"),
        (ord(' '),     "' '"),
        (ord('\n'),    "'\\n'"),
        (ord('\t'),    "'\\t'"),
        (ord('\r'),    "'\\r'"),
    )
    for special_code, text in fixed_representations:
        if Code == special_code:
            return text

    # Remaining codes below ' ' are not 'visible': print their number.
    if Code < ord(' '):
        return "\\" + repr(Code)

    return "'" + utf8.map_unicode_to_utf8(Code) + "'"
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Write the characters of 'CharSet' to stdout as a table.

    CharSet     -- character set; it is iterated by means of 'CharacterList'.
    Display     -- "hex":  8 code points per row, printed in hexadecimal.
                   "utf8": 32 code points per row, printed as characters
                           (codes below 0x20 are printed as '?').
    ScreenWidth -- not used by this function.

    Each row starts with the code of its first column. Rows without any
    character are left out; a gap of skipped rows is marked by '...'.
    An empty set results in a single notification line.
    """
    assert Display in ["hex", "utf8"]

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth       = 6
    else:
        CharactersPerLine = 32
        ColumnWidth       = 2

    character_list = CharacterList(CharSet)
    if character_list.is_empty():
        sys.stdout.write("<Result = Empty Character Set>\n")
        return

    # Avoid memory overflow for very large sets: get character by character
    last_start_character_of_line = -1
    last_horizontal_offset       = -1
    while True:
        character_code = character_list.next()
        if character_code is None:
            break

        start_character_of_line = character_code - character_code % CharactersPerLine
        horizontal_offset       = character_code - start_character_of_line

        if start_character_of_line > last_start_character_of_line + CharactersPerLine:
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # No column has been printed on the new row yet. '-1' makes the
            # padding below cover ALL columns in front of the first character
            # (with '0' a character at offset k >= 1 ended up in column k-1).
            last_horizontal_offset = -1

        # One blank cell for every column skipped since the last printed one.
        sys.stdout.write(" " * ColumnWidth * (horizontal_offset - last_horizontal_offset - 1))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        else:
            if character_code >= 0x20:
                sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
            else:
                sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset       = horizontal_offset
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Write the characters of 'CharSet' to stdout as a table.

    CharSet     -- character set providing 'get_intervals()'; each interval
                   covers the codes from '.begin' up to, but excluding, '.end'.
    Display     -- "hex":  8 code points per row, printed in hexadecimal.
                   "utf8": 32 code points per row, printed as characters
                           (codes below 0x20 are printed as '?').
    ScreenWidth -- not used by this function.

    Each row starts with the code of its first column. Rows without any
    character are left out; a gap of skipped rows is marked by '...'.
    """
    assert Display in ["hex", "utf8"]

    interval_list = CharSet.get_intervals(PromiseNotToChangeAnythingF=True)

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth       = 6
    else:
        CharactersPerLine = 32
        ColumnWidth       = 2

    character_list = []
    for interval in interval_list:
        character_list.extend(range(interval.begin, interval.end))

    # just to make sure ...
    character_list.sort()

    # '-1' = no row started yet. Starting with '0' suppressed the row header
    # whenever the first character was located in the very first row.
    last_start_character_of_line = -1
    last_horizontal_offset       = -1
    for character_code in character_list:
        start_character_of_line = character_code - character_code % CharactersPerLine
        horizontal_offset       = character_code - start_character_of_line

        if start_character_of_line > last_start_character_of_line + CharactersPerLine:
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # No column has been printed on the new row yet. '-1' makes the
            # padding below cover ALL columns in front of the first character.
            last_horizontal_offset = -1

        # One blank cell for every column skipped since the last printed one.
        sys.stdout.write(" " * ColumnWidth * (horizontal_offset - last_horizontal_offset - 1))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        elif character_code >= 0x20:
            sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
        else:
            # Control characters (newline, tab, ...) would destroy the layout.
            sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset       = horizontal_offset
def get_range_skipper(EndSequence, LanguageDB, MissingClosingDelimiterAction=""):
    """Fill 'range_skipper_template' for a skipper that ends at 'EndSequence'.

    EndSequence -- non-empty list of integers: the closing delimiter, one
                   code per character.
    LanguageDB  -- mapping from '$...' keys to code fragments, or to
                   functions that produce code fragments.
    MissingClosingDelimiterAction -- text that replaces
                   $$MISSING_CLOSING_DELIMITER$$ in the template.

    Returns the generated code as a string.
    """
    assert EndSequence.__class__ == list
    assert len(EndSequence) >= 1
    assert map(type, EndSequence) == [int] * len(EndSequence)

    # Name the $$SKIPPER$$
    skipper_index = sm_index.get()

    # Determine the $$DELIMITER$$
    hex_parts     = []
    comment_parts = [" Delimiter: "]
    for code in EndSequence:
        comment_parts.append("'%s', " % utf8.map_unicode_to_utf8(code))
        hex_parts.append("0x%X, " % code)
    delimiter_str         = "".join(hex_parts)
    delimiter_length_str  = "%i" % len(EndSequence)
    delimiter_comment_str = LanguageDB["$comment"]("".join(comment_parts))

    # Determine the check for the tail of the delimiter
    # (nothing is generated for a delimiter of a single character)
    remainder_parts = []
    for position in range(1, len(EndSequence)):
        remainder_parts.append(" " + LanguageDB["$input/get-offset"](position - 1) + "\n")
        remainder_parts.append(" " + LanguageDB["$if !="]("Skipper$$SKIPPER_INDEX$$[%i]" % position))
        remainder_parts.append(" " + LanguageDB["$goto"]("$entry", skipper_index) + "\n")
        remainder_parts.append(" " + LanguageDB["$endif"])
    delimiter_remainder_test_str = "".join(remainder_parts)

    # The main part
    main_replacements = [
        ["$$DELIMITER$$",                   delimiter_str],
        ["$$DELIMITER_LENGTH$$",            delimiter_length_str],
        ["$$DELIMITER_COMMENT$$",           delimiter_comment_str],
        ["$$WHILE_1_PLUS_1_EQUAL_2$$",      LanguageDB["$loop-start-endless"]],
        ["$$END_WHILE$$",                   LanguageDB["$loop-end"]],
        ["$$INPUT_P_INCREMENT$$",           LanguageDB["$input/increment"]],
        ["$$INPUT_P_DECREMENT$$",           LanguageDB["$input/decrement"]],
        ["$$INPUT_GET$$",                   LanguageDB["$input/get"]],
        ["$$IF_INPUT_EQUAL_DELIMITER_0$$",  LanguageDB["$if =="]("Skipper$$SKIPPER_INDEX$$[0]")],
        ["$$BREAK$$",                       LanguageDB["$break"]],
        ["$$ENDIF$$",                       LanguageDB["$endif"]],
        ["$$ENTRY$$",                       LanguageDB["$label-def"]("$entry", skipper_index)],
        ["$$DROP_OUT$$",                    LanguageDB["$label-def"]("$drop-out", skipper_index)],
        ["$$GOTO_ENTRY$$",                  LanguageDB["$goto"]("$entry", skipper_index)],
        ["$$GOTO_REENTRY_PREPARATION$$",    LanguageDB["$goto"]("$re-start")],
        ["$$MARK_LEXEME_START$$",           LanguageDB["$mark-lexeme-start"]],
        ["$$DELIMITER_REMAINDER_TEST$$",    delimiter_remainder_test_str],
        ["$$SET_INPUT_P_BEHIND_DELIMITER$$", LanguageDB["$input/add"](len(EndSequence)-1)],
        ["$$MISSING_CLOSING_DELIMITER$$",   MissingClosingDelimiterAction],
    ]
    code_str = blue_print(range_skipper_template, main_replacements)

    # Line and column number counting
    code_str = __range_skipper_lc_counting_replacements(code_str, EndSequence)

    # The finishing touch: these markers may also occur in text that was
    # inserted by the replacements above, so they are resolved last.
    final_replacements = [
        ["$$SKIPPER_INDEX$$", __nice(skipper_index)],
        ["$$GOTO_DROP_OUT$$", LanguageDB["$goto"]("$drop-out", skipper_index)],
    ]
    return blue_print(code_str, final_replacements)
def get_range_skipper(EndSequence, LanguageDB, MissingClosingDelimiterAction=""):
    """Generate skipper code from 'range_skipper_template'.

    EndSequence -- list of at least one integer; the character codes of the
                   delimiter that terminates the skipped range.
    LanguageDB  -- dictionary of code fragments (strings) and code
                   generating functions, accessed by '$...' keys.
    MissingClosingDelimiterAction -- inserted for the marker
                   $$MISSING_CLOSING_DELIMITER$$.

    Returns the resulting code string.
    """
    assert EndSequence.__class__ == list
    assert len(EndSequence) >= 1
    assert map(type, EndSequence) == [int] * len(EndSequence)

    # Name the $$SKIPPER$$
    skipper_index = sm_index.get()

    # Determine the $$DELIMITER$$
    delimiter_codes = ""
    delimiter_names = " Delimiter: "
    for character in EndSequence:
        delimiter_names += "'%s', " % utf8.map_unicode_to_utf8(character)
        delimiter_codes += "0x%X, " % character
    delimiter_count = "%i" % len(EndSequence)
    delimiter_names = LanguageDB["$comment"](delimiter_names)

    # Determine the check for the tail of the delimiter
    if len(EndSequence) == 1:
        tail_check = ""
    else:
        tail_check = ""
        # 'offset' addresses the input, 'offset + 1' the delimiter element.
        for offset, dummy in enumerate(EndSequence[1:]):
            tail_check += " " + LanguageDB["$input/get-offset"](offset) + "\n"
            tail_check += " " + LanguageDB["$if !="]("Skipper$$SKIPPER_INDEX$$[%i]" % (offset + 1))
            tail_check += " " + LanguageDB["$goto"]("$entry", skipper_index) + "\n"
            tail_check += " " + LanguageDB["$endif"]

    # The main part
    code_str = blue_print(
        range_skipper_template,
        [["$$DELIMITER$$", delimiter_codes],
         ["$$DELIMITER_LENGTH$$", delimiter_count],
         ["$$DELIMITER_COMMENT$$", delimiter_names],
         ["$$WHILE_1_PLUS_1_EQUAL_2$$", LanguageDB["$loop-start-endless"]],
         ["$$END_WHILE$$", LanguageDB["$loop-end"]],
         ["$$INPUT_P_INCREMENT$$", LanguageDB["$input/increment"]],
         ["$$INPUT_P_DECREMENT$$", LanguageDB["$input/decrement"]],
         ["$$INPUT_GET$$", LanguageDB["$input/get"]],
         ["$$IF_INPUT_EQUAL_DELIMITER_0$$",
          LanguageDB["$if =="]("Skipper$$SKIPPER_INDEX$$[0]")],
         ["$$BREAK$$", LanguageDB["$break"]],
         ["$$ENDIF$$", LanguageDB["$endif"]],
         ["$$ENTRY$$", LanguageDB["$label-def"]("$entry", skipper_index)],
         ["$$DROP_OUT$$", LanguageDB["$label-def"]("$drop-out", skipper_index)],
         ["$$GOTO_ENTRY$$", LanguageDB["$goto"]("$entry", skipper_index)],
         ["$$GOTO_REENTRY_PREPARATION$$", LanguageDB["$goto"]("$re-start")],
         ["$$MARK_LEXEME_START$$", LanguageDB["$mark-lexeme-start"]],
         ["$$DELIMITER_REMAINDER_TEST$$", tail_check],
         ["$$SET_INPUT_P_BEHIND_DELIMITER$$",
          LanguageDB["$input/add"](len(EndSequence) - 1)],
         ["$$MISSING_CLOSING_DELIMITER$$", MissingClosingDelimiterAction]])

    # Line and column number counting
    code_str = __range_skipper_lc_counting_replacements(code_str, EndSequence)

    # The finishing touch
    skipper_name  = __nice(skipper_index)
    goto_drop_out = LanguageDB["$goto"]("$drop-out", skipper_index)
    code_str = blue_print(code_str,
                          [["$$SKIPPER_INDEX$$", skipper_name],
                           ["$$GOTO_DROP_OUT$$", goto_drop_out]])
    return code_str
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- stream ('StringIO' or 'file') positioned behind the opening '['.
             It is consumed up to and including the closing ']'.

       Returns the set of matching code points. If the expression starts
       with '^' the inverse set is returned, restricted to the range given
       by 'Setup.get_character_value_limit()' if such a limit is set.

       Raises RegularExpressionException on a missing ']', a '-' without a
       character in front or behind it, a '-' directly following the range
       operator, and a range whose borders are the same character.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        """Consume 'letter' if it is next in 'stream'; else restore the position."""
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    # Checks for " appearing twice. Some users did use constructs such
    # as "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()

    # The loop is left only by the closing ']' or by an exception. A loop
    # condition on 'char_code != 0xFF' would silently stop parsing in front
    # of the ']' as soon as a backslashed character evaluates to 0xFF.
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)
        quote_checker.do(char_code)

        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF:
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range: 'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # It is the character BEHIND the operator that must not be
                # '-'. (The first character cannot be an unescaped '-' here;
                # that case has been rejected above.)
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        result = tracker.match_set.inverse()
        # 'sys.maxint' stands for 'no limit'; else cut the inverse set
        # down to the admissible range of character values.
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(Interval(0, Setup.get_character_value_limit()))
        return result
    else:
        return tracker.match_set
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- 'StringIO' or 'file' object; reading starts behind the opening
             '[' and ends behind the closing ']'.

       Returns 'tracker.match_set', or its inverse if the expression begins
       with '^'. The inverse is intersected with the interval from 0 to
       'Setup.get_character_value_limit()' unless that limit is 'sys.maxint'.

       Raises RegularExpressionException for malformed expressions: no
       closing ']', a range operator '-' that lacks its first or second
       character, '-' as second character, or a range of a single element.
    """
    assert sh.__class__.__name__ == "StringIO" \
        or sh.__class__.__name__ == "file"

    def _next_letter_is(stream, letter):
        """True if 'letter' comes next (it is then consumed); else the
        stream position is left untouched."""
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    def _read_second_border(first_code):
        """Read and verify the character behind the range operator '-'."""
        second_code = utf8.__read_one_utf8_code_from_stream(sh)
        quote_checker.do(second_code)

        if second_code in [0xFF, ord(']')]:
            raise RegularExpressionException(
                "Character range: '-' requires a character following '-'.")
        elif second_code == ord("-"):
            # The check concerns the character that FOLLOWS the operator;
            # testing the first character here could never detect 'a--'.
            raise RegularExpressionException(
                "Character range operator '-' followed by '-'.")
        elif second_code == ord("\\"):
            second_code = snap_backslashed_character.do(sh)

        if first_code == second_code:
            utf8_string = utf8.map_unicode_to_utf8(first_code)
            raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                             % (utf8_string, utf8_string) + \
                                             "In this case avoid range expression for clarity.")
        return second_code

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()
    if _next_letter_is(sh, "^"):
        tracker.negation_f = True

    # Checks for " appearing twice. Some users did use constructs such
    # as "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()

    # Only the closing ']' (or an exception) ends the loop. Testing
    # 'char_code != 0xFF' as loop condition would quit without an error in
    # front of the ']' if a backslashed character has the value 0xFF.
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)
        quote_checker.do(char_code)

        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if _next_letter_is(sh, "-"):
            # (*) Character range: 'character0' '-' 'character1'
            char_code_2 = _read_second_border(char_code)
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)
        else:
            # (*) Normal character
            tracker.consider_letter(char_code)

    if not tracker.negation_f:
        return tracker.match_set

    result = tracker.match_set.inverse()
    # A limit of 'sys.maxint' means that character values are not restricted.
    if Setup.get_character_value_limit() != sys.maxint:
        result.intersect_with(
            Interval(0, Setup.get_character_value_limit()))
    return result