Exemple #1
0
    def consider_interval(self, Begin, End):
        """Add the interval given by 'Begin' and 'End' to 'self.match_set'.

        Begin, End -- integer code points; they are handed unchanged to
                      'Interval(Begin, End)'.

        Raises RegularExpressionException if 'Begin' is greater than 'End'.

        NOTE(review): 'End' is printed in the error message exactly as it was
        passed. If callers pass the first code point *behind* the range, the
        message shows one character too far -- confirm against the callers.
        """
        if Begin > End:
            range_description = \
                "found range '%s-%s' which corresponds to %i-%i as unicode code points." \
                % (utf8.map_unicode_to_utf8(Begin),
                   utf8.map_unicode_to_utf8(End),
                   Begin, End)
            raise RegularExpressionException(
                "Character range: '-' requires character with 'lower code' to preceed\n"
                + range_description)

        new_interval = Interval(Begin, End)
        self.match_set.add_interval(new_interval)
    def consider_interval(self, Begin, End):
        """Enter 'Interval(Begin, End)' into the match set of this object.

        Begin, End -- integer code points delimiting the interval.

        Raises RegularExpressionException if 'Begin' is greater than 'End';
        the message names both borders as characters and as code points.

        NOTE(review): the message prints 'End' as given. Should 'End' be the
        first code point outside the range, the reported character is off by
        one -- verify with the calling code.
        """
        if Begin > End:
            headline = "Character range: '-' requires character with 'lower code' to preceed\n"
            begin_txt = utf8.map_unicode_to_utf8(Begin)
            end_txt = utf8.map_unicode_to_utf8(End)
            detail = "found range '%s-%s' which corresponds to %i-%i as unicode code points." \
                     % (begin_txt, end_txt, Begin, End)
            raise RegularExpressionException(headline + detail)

        self.match_set.add_interval(Interval(Begin, End))
Exemple #3
0
 def __utf8_char(self, Code):
     """Return a human readable string for the code point 'Code'.

     -sys.maxint / sys.maxint  --> "-oo" / "oo"
     space, newline, tab, CR   --> quoted, whitespace written as escape
     other codes below ' '     --> backslash followed by the decimal code
     anything else             --> the UTF-8 character in single quotes
     """
     special_names = {
         -sys.maxint: "-oo",
         sys.maxint:  "oo",
         ord(' '):    "' '",
         ord('\n'):   "'\\n'",
         ord('\t'):   "'\\t'",
         ord('\r'):   "'\\r'",
     }
     if Code in special_names:
         return special_names[Code]

     # Codes below ' ' have no visible glyph: show the number instead.
     if Code < ord(' '):
         return "\\" + repr(Code)

     return "'" + utf8.map_unicode_to_utf8(Code) + "'"
Exemple #4
0
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Print every character of 'CharSet' to stdout, arranged as a table.

    Each output line starts with the hexadecimal code of the first code point
    of that line ("%05X: "). A character is placed in the column given by its
    offset from that line start. If one or more lines contain no character of
    the set, the gap is marked by a line "...".

    CharSet     -- character set; it is only passed on to 'CharacterList'.
    Display     -- "hex":  print the character codes, 8 per line.
                   "utf8": print the characters themselves, 32 per line;
                           codes below 0x20 are shown as '?'.
    ScreenWidth -- not used by this function.
    """
    assert Display in ["hex", "utf8"]

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth = 6
    else:
        CharactersPerLine = 32
        ColumnWidth = 2

    character_list = CharacterList(CharSet)
    if character_list.is_empty():
        sys.stdout.write("<Result = Empty Character Set>\n")
        return

    # Avoid memory overflow for very large sets: get character by character
    last_start_character_of_line = -1
    # '-1' means: no column of the current line has been printed yet.
    last_horizontal_offset = -1
    while True:
        character_code = character_list.next()
        if character_code is None: break

        start_character_of_line = character_code - character_code % CharactersPerLine
        horizontal_offset = character_code - start_character_of_line

        if start_character_of_line > last_start_character_of_line + CharactersPerLine:
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # A fresh line is empty. Resetting to '0' would pretend that
            # column 0 is already filled and shift the whole line one column
            # to the left whenever its first character is not at offset 0.
            last_horizontal_offset = -1

        # Fill the columns between the previous and the current character.
        sys.stdout.write(" " * ColumnWidth *
                         (horizontal_offset - last_horizontal_offset - 1))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        else:
            if character_code >= 0x20:
                sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
            else:
                sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset = horizontal_offset
Exemple #5
0
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Write the characters of 'CharSet' to stdout in table form.

    Every line is introduced by the hexadecimal code of its first code point
    ("%05X: "); each character appears in the column that corresponds to its
    offset from that code point. Skipped lines are indicated by "...".

    CharSet     -- character set; only handed to 'CharacterList'.
    Display     -- "hex":  character codes, 8 columns of width 6.
                   "utf8": the characters, 32 columns of width 2; codes
                           below 0x20 are replaced by '?'.
    ScreenWidth -- not used by this function.
    """
    assert Display in ["hex", "utf8"]

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth       = 6
    else:
        CharactersPerLine = 32
        ColumnWidth       = 2

    character_list = CharacterList(CharSet)
    if character_list.is_empty():
        sys.stdout.write("<Result = Empty Character Set>\n")
        return

    # Avoid memory overflow for very large sets: get character by character 
    last_start_character_of_line = -1
    last_horizontal_offset       = -1   # -1 => nothing printed on the line yet
    while True:
        character_code = character_list.next()
        if character_code is None: break

        start_character_of_line = character_code - character_code % CharactersPerLine
        horizontal_offset       = character_code - start_character_of_line

        if start_character_of_line > last_start_character_of_line + CharactersPerLine: 
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # New line, no column occupied. With '0' here the first character
            # of a line that is not at offset 0 would be placed one column
            # too far to the left.
            last_horizontal_offset = -1

        # Pad the unoccupied columns since the last printed character.
        sys.stdout.write(" " * ColumnWidth * (horizontal_offset - last_horizontal_offset - 1))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        else:
            if character_code >= 0x20:
                sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
            else:
                sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset       = horizontal_offset
Exemple #6
0
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Print all characters of 'CharSet' to stdout as a table.

    The characters are collected from the intervals that
    'CharSet.get_intervals()' reports (each interval contributes
    'range(begin, end)'). Every output line starts with the hexadecimal code
    of its first code point ("%05X: "); a character is printed in the column
    given by its offset from that code point. Skipped lines are indicated by
    a line "...".

    CharSet     -- object providing 'get_intervals()'.
    Display     -- "hex":  print the character codes, 8 per line.
                   "utf8": print the characters themselves, 32 per line;
                           codes below 0x20 are shown as '?'.
    ScreenWidth -- not used by this function.
    """
    assert Display in ["hex", "utf8"]

    interval_list = CharSet.get_intervals(PromiseNotToChangeAnythingF=True)

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth       = 6
    else:
        CharactersPerLine = 32
        ColumnWidth       = 2

    character_list = []
    for interval in interval_list:
        character_list.extend(range(interval.begin, interval.end))

    # just to make sure ...
    character_list.sort()

    # '-1' as start value: no line has been started yet. With '0' the header
    # of the very first line would be missing for characters below
    # 'CharactersPerLine'.
    last_start_character_of_line = -1
    last_horizontal_offset       = -1
    for character_code in character_list:
        start_character_of_line = character_code - character_code % CharactersPerLine
        horizontal_offset       = character_code - start_character_of_line

        if start_character_of_line > last_start_character_of_line + CharactersPerLine: 
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # Fresh line: no column occupied yet. '0' would shift a line whose
            # first character is not at offset 0 one column to the left.
            last_horizontal_offset = -1

        # Pad the unoccupied columns since the last printed character.
        sys.stdout.write(" " * ColumnWidth * (horizontal_offset - last_horizontal_offset - 1))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        elif character_code >= 0x20:
            sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
        else:
            # Control characters (newline, tab, ...) would destroy the table.
            sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset       = horizontal_offset
def get_range_skipper(EndSequence, LanguageDB, MissingClosingDelimiterAction=""):
    """Return the code of a range skipper that ends at 'EndSequence'.

    The code is produced by filling 'range_skipper_template' with the
    delimiter data and with code fragments taken from 'LanguageDB'.

    EndSequence -- non-empty list of integers: the code points of the
                   closing delimiter.
    LanguageDB  -- mapping from '$...' keys to code fragments or to
                   functions that produce code fragments.
    MissingClosingDelimiterAction
                -- text inserted for '$$MISSING_CLOSING_DELIMITER$$'.
    """
    assert EndSequence.__class__ == list
    assert len(EndSequence) >= 1
    assert [type(letter) for letter in EndSequence] == [int] * len(EndSequence)

    delimiter_length = len(EndSequence)

    # Name the $$SKIPPER$$
    skipper_index = sm_index.get()

    # $$DELIMITER$$: the code points as hex numbers; the comment shows the
    #                same sequence as characters.
    delimiter_str = "".join(["0x%X, " % letter for letter in EndSequence])
    quoted_letters = ["'%s', " % utf8.map_unicode_to_utf8(letter)
                      for letter in EndSequence]
    delimiter_length_str = "%i" % delimiter_length
    delimiter_comment_str = LanguageDB["$comment"](
        "                         Delimiter: " + "".join(quoted_letters))

    # Check for the tail of the delimiter: one comparison per letter behind
    # the first one. For a single letter delimiter nothing is generated.
    tail_test = []
    for i in range(1, delimiter_length):
        tail_test.append("    " + LanguageDB["$input/get-offset"](i - 1) + "\n")
        tail_test.append("    " + LanguageDB["$if !="]("Skipper$$SKIPPER_INDEX$$[%i]" % i))
        tail_test.append("         " + LanguageDB["$goto"]("$entry", skipper_index) + "\n")
        tail_test.append("    " + LanguageDB["$endif"])
    delimiter_remainder_test_str = "".join(tail_test)

    # The main part
    replacements = [
        ["$$DELIMITER$$",                    delimiter_str],
        ["$$DELIMITER_LENGTH$$",             delimiter_length_str],
        ["$$DELIMITER_COMMENT$$",            delimiter_comment_str],
        ["$$WHILE_1_PLUS_1_EQUAL_2$$",       LanguageDB["$loop-start-endless"]],
        ["$$END_WHILE$$",                    LanguageDB["$loop-end"]],
        ["$$INPUT_P_INCREMENT$$",            LanguageDB["$input/increment"]],
        ["$$INPUT_P_DECREMENT$$",            LanguageDB["$input/decrement"]],
        ["$$INPUT_GET$$",                    LanguageDB["$input/get"]],
        ["$$IF_INPUT_EQUAL_DELIMITER_0$$",   LanguageDB["$if =="]("Skipper$$SKIPPER_INDEX$$[0]")],
        ["$$BREAK$$",                        LanguageDB["$break"]],
        ["$$ENDIF$$",                        LanguageDB["$endif"]],
        ["$$ENTRY$$",                        LanguageDB["$label-def"]("$entry", skipper_index)],
        ["$$DROP_OUT$$",                     LanguageDB["$label-def"]("$drop-out", skipper_index)],
        ["$$GOTO_ENTRY$$",                   LanguageDB["$goto"]("$entry", skipper_index)],
        ["$$GOTO_REENTRY_PREPARATION$$",     LanguageDB["$goto"]("$re-start")],
        ["$$MARK_LEXEME_START$$",            LanguageDB["$mark-lexeme-start"]],
        ["$$DELIMITER_REMAINDER_TEST$$",     delimiter_remainder_test_str],
        ["$$SET_INPUT_P_BEHIND_DELIMITER$$", LanguageDB["$input/add"](delimiter_length - 1)],
        ["$$MISSING_CLOSING_DELIMITER$$",    MissingClosingDelimiterAction],
    ]
    code_str = blue_print(range_skipper_template, replacements)

    # Line and column number counting
    code_str = __range_skipper_lc_counting_replacements(code_str, EndSequence)

    # The finishing touch: '$$SKIPPER_INDEX$$' also occurs inside the
    # fragments inserted above, so it is replaced last.
    final_replacements = [
        ["$$SKIPPER_INDEX$$", __nice(skipper_index)],
        ["$$GOTO_DROP_OUT$$", LanguageDB["$goto"]("$drop-out", skipper_index)],
    ]
    return blue_print(code_str, final_replacements)
Exemple #8
0
def get_range_skipper(EndSequence, LanguageDB, MissingClosingDelimiterAction=""):
    """Generate the range skipper code for the closing delimiter 'EndSequence'.

    'range_skipper_template' is filled by means of 'blue_print()' with the
    delimiter description and with code fragments from 'LanguageDB'. The
    resulting code is returned as string.

    EndSequence -- list of at least one integer (code points of the
                   delimiter that closes the range).
    LanguageDB  -- mapping from '$...' keys to code fragments or functions
                   generating them.
    MissingClosingDelimiterAction
                -- replacement for '$$MISSING_CLOSING_DELIMITER$$'.
    """
    assert EndSequence.__class__ == list
    assert len(EndSequence) >= 1
    assert [type(code) for code in EndSequence] == [int] * len(EndSequence)

    # Name the $$SKIPPER$$
    skipper_index = sm_index.get()

    # Determine the $$DELIMITER$$ (hex codes) and its comment (characters)
    hex_codes = []
    characters = []
    for code in EndSequence:
        characters.append("'%s', " % utf8.map_unicode_to_utf8(code))
        hex_codes.append("0x%X, " % code)
    delimiter_str = "".join(hex_codes)
    delimiter_length_str = "%i" % len(EndSequence)
    delimiter_comment_str = LanguageDB["$comment"](
        "                         Delimiter: " + "".join(characters))

    # Determine the check for the tail of the delimiter. The loop does not
    # run for a delimiter of length one; the test string is empty then.
    delimiter_remainder_test_str = ""
    for index in range(1, len(EndSequence)):
        get_offset = LanguageDB["$input/get-offset"](index - 1)
        if_unequal = LanguageDB["$if !="]("Skipper$$SKIPPER_INDEX$$[%i]" % index)
        goto_entry = LanguageDB["$goto"]("$entry", skipper_index)
        delimiter_remainder_test_str += (
            "    " + get_offset + "\n"
            + "    " + if_unequal
            + "         " + goto_entry + "\n"
            + "    " + LanguageDB["$endif"]
        )

    # The main part
    main_replacements = [
        ["$$DELIMITER$$", delimiter_str],
        ["$$DELIMITER_LENGTH$$", delimiter_length_str],
        ["$$DELIMITER_COMMENT$$", delimiter_comment_str],
        ["$$WHILE_1_PLUS_1_EQUAL_2$$", LanguageDB["$loop-start-endless"]],
        ["$$END_WHILE$$", LanguageDB["$loop-end"]],
        ["$$INPUT_P_INCREMENT$$", LanguageDB["$input/increment"]],
        ["$$INPUT_P_DECREMENT$$", LanguageDB["$input/decrement"]],
        ["$$INPUT_GET$$", LanguageDB["$input/get"]],
        ["$$IF_INPUT_EQUAL_DELIMITER_0$$",
         LanguageDB["$if =="]("Skipper$$SKIPPER_INDEX$$[0]")],
        ["$$BREAK$$", LanguageDB["$break"]],
        ["$$ENDIF$$", LanguageDB["$endif"]],
        ["$$ENTRY$$", LanguageDB["$label-def"]("$entry", skipper_index)],
        ["$$DROP_OUT$$", LanguageDB["$label-def"]("$drop-out", skipper_index)],
        ["$$GOTO_ENTRY$$", LanguageDB["$goto"]("$entry", skipper_index)],
        ["$$GOTO_REENTRY_PREPARATION$$", LanguageDB["$goto"]("$re-start")],
        ["$$MARK_LEXEME_START$$", LanguageDB["$mark-lexeme-start"]],
        ["$$DELIMITER_REMAINDER_TEST$$", delimiter_remainder_test_str],
        ["$$SET_INPUT_P_BEHIND_DELIMITER$$",
         LanguageDB["$input/add"](len(EndSequence) - 1)],
        ["$$MISSING_CLOSING_DELIMITER$$", MissingClosingDelimiterAction],
    ]
    code_str = blue_print(range_skipper_template, main_replacements)

    # Line and column number counting
    code_str = __range_skipper_lc_counting_replacements(code_str, EndSequence)

    # The finishing touch: the skipper index is inserted last, because the
    # fragments inserted before contain '$$SKIPPER_INDEX$$' themselves.
    code_str = blue_print(code_str, [
        ["$$SKIPPER_INDEX$$", __nice(skipper_index)],
        ["$$GOTO_DROP_OUT$$", LanguageDB["$goto"]("$drop-out", skipper_index)],
    ])

    return code_str
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- stream of class 'StringIO' or 'file'. Reading starts at the
             current position (presumably right behind the opening '[' --
             confirm with the caller) and stops behind the closing ']'.

       A leading '^' inverts the set. The inverse is restricted to the range
       from 0 to 'Setup.get_character_value_limit()' unless that limit equals
       'sys.maxint'.

       Raises RegularExpressionException if
         -- a '-' appears without a preceding character,
         -- the closing ']' is missing,
         -- a '-' is not followed by a character, or is followed by '-',
         -- a range consists of a single element such as 'a-a'.
       'Tracker.consider_interval()' may raise further exceptions.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consume the next character only if it is 'letter'; otherwise the
        # stream position is restored.
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:                        stream.seek(position); return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code     = None
    quote_checker = DoubleQuoteChecker() # Checks for " appearing twice. Some users did use
    #                                    # constructs such as "-" and ended up in confusing behavior.
    # NOTE(review): 0xFF is treated as 'end of stream' -- presumably what
    #               the utf8 reader returns when nothing is left; confirm.
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF: 
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"): 
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]: 
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # It is the character *behind* the operator that must not be
                # '-'. Testing 'char_code' here never detected '--' and fired
                # wrongly for a range starting with an escaped '\-'.
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"): 
                char_code_2 = snap_backslashed_character.do(sh)  

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f: 
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(Interval(0, Setup.get_character_value_limit()))
        return result
    else:                  
        return tracker.match_set
Exemple #10
0
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- stream of class 'StringIO' or 'file'. It is read from its
             current position (presumably right behind the opening '[' --
             confirm with the caller) up to and including the closing ']'.

       If the expression starts with '^' the inverse set is returned. It is
       cut to the range from 0 to 'Setup.get_character_value_limit()' unless
       that limit equals 'sys.maxint'.

       Raises RegularExpressionException if
         -- a '-' appears without a preceding character,
         -- the closing ']' is missing,
         -- a '-' is not followed by a character, or is followed by '-',
         -- a range consists of a single element such as 'a-a'.
       'Tracker.consider_interval()' may raise further exceptions.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Eat the next character if it is 'letter' and report True.
        # Otherwise step back to where the stream was and report False.
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:
            stream.seek(position)
            return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code = None
    # Checks for " appearing twice. Some users did use constructs such
    # as "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    # NOTE(review): 0xFF is handled as 'end of stream' -- presumably the
    #               value the utf8 reader delivers at the end; confirm.
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'."
            )
        elif char_code == 0xFF:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The second character of the range is the one that follows
                # the operator. The former test on 'char_code' could not
                # detect '--' and reported it wrongly for ranges that start
                # with an escaped '\-'.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(
                Interval(0, Setup.get_character_value_limit()))
        return result
    else:
        return tracker.match_set