Ejemplo n.º 1
0
    def consider_interval(self, Begin, End):
        """Add the interval described by 'Begin' and 'End' to 'self.match_set'.

        Raises RegularExpressionException if 'Begin' is greater than 'End'.
        """
        if Begin > End:
            # Only the second line of the message carries format fields.
            range_info = "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                         (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(End), Begin, End)
            raise RegularExpressionException(
                "Character range: '-' requires character with 'lower code' to preceed\n" + range_info)

        target_set = self.match_set
        target_set.add_interval(Interval(Begin, End))
Ejemplo n.º 2
0
    def consider_interval(self, Begin, End):
        """Add the interval described by 'Begin' and 'End' to 'self.match_set'.

        Raises RegularExpressionException if 'Begin' is greater than 'End'.
        """
        if Begin > End:
            # Only the second line of the message carries format fields.
            range_info = "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                         (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(End), Begin, End)
            raise RegularExpressionException(
                "Character range: '-' requires character with 'lower code' to preceed\n" + range_info)

        target_set = self.match_set
        target_set.add_interval(Interval(Begin, End))
Ejemplo n.º 3
0
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Print the characters of 'CharSet' one by one as a table on stdout.

    Every output line starts with the 5-digit hex code of the first slot of
    that line, followed by the characters of the set, each padded to the
    column of its slot. A line holds 8 slots in "hex" display and 32 slots
    in "utf8" display. A line '...' is written whenever the next line to be
    printed does not directly follow the previously printed one.

    CharSet     -- set of characters; handed to CharacterList.
    Display     -- "hex":  print code points as 5-digit hex numbers.
                   "utf8": print the characters themselves; code points
                           below 0x20 are printed as '?'.
    ScreenWidth -- not used by this function.

    An empty set is reported as '<Result = Empty Character Set>'.
    """
    # just to make sure ...
    assert Display in ["hex", "utf8"]

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth = 6       # width of "%05X "
    else:
        CharactersPerLine = 32
        ColumnWidth = 2       # width of "%s " for a single-cell character

    character_list = CharacterList(CharSet)
    if character_list.is_empty():
        sys.stdout.write("<Result = Empty Character Set>\n")
        return

    # Avoid memory overflow for very large sets: get character by character
    last_start_character_of_line = -1
    # Slot of the character printed last on the current line.
    # '-1' means: nothing has been printed on the current line yet.
    last_horizontal_offset = -1
    while True:
        character_code = character_list.next()
        if character_code is None: break

        start_character_of_line = character_code - character_code % CharactersPerLine
        horizontal_offset = character_code - start_character_of_line

        if start_character_of_line > last_start_character_of_line + CharactersPerLine:
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # New line: no slot is occupied yet. Resetting to '0' here would
            # drop one cell of padding and shift every line that does not
            # start at slot 0 one column to the left.
            last_horizontal_offset = -1

        # Fill the empty slots between the previous character and this one.
        sys.stdout.write(" " * ColumnWidth *
                         (horizontal_offset - last_horizontal_offset - 1))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        else:
            if character_code >= 0x20:
                sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
            else:
                sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset = horizontal_offset
Ejemplo n.º 4
0
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Print the characters of 'CharSet' one by one as a table on stdout.

    Every output line starts with the 5-digit hex code of the first slot of
    that line, followed by the characters of the set, each padded to the
    column of its slot. A line holds 8 slots in "hex" display and 32 slots
    in "utf8" display. A line '...' is written whenever the next line to be
    printed does not directly follow the previously printed one.

    CharSet     -- set of characters; handed to CharacterList.
    Display     -- "hex":  print code points as 5-digit hex numbers.
                   "utf8": print the characters themselves; code points
                           below 0x20 are printed as '?'.
    ScreenWidth -- not used by this function.

    An empty set is reported as '<Result = Empty Character Set>'.
    """
    # just to make sure ...
    assert Display in ["hex", "utf8"]

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth       = 6    # width of "%05X "
    else:
        CharactersPerLine = 32
        ColumnWidth       = 2    # width of "%s " for a single-cell character

    character_list = CharacterList(CharSet)
    if character_list.is_empty():
        sys.stdout.write("<Result = Empty Character Set>\n")
        return

    # Avoid memory overflow for very large sets: get character by character
    last_start_character_of_line = -1
    # Slot of the character printed last on the current line.
    # '-1' means: nothing has been printed on the current line yet.
    last_horizontal_offset       = -1
    while True:
        character_code = character_list.next()
        if character_code is None: break

        start_character_of_line = character_code - character_code % CharactersPerLine
        horizontal_offset       = character_code - start_character_of_line

        if start_character_of_line > last_start_character_of_line + CharactersPerLine:
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # New line: no slot is occupied yet. Resetting to '0' here would
            # drop one cell of padding and shift every line that does not
            # start at slot 0 one column to the left.
            last_horizontal_offset = -1

        # Fill the empty slots between the previous character and this one.
        sys.stdout.write(" " * ColumnWidth * (horizontal_offset - last_horizontal_offset - 1))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        else:
            if character_code >= 0x20:
                sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
            else:
                sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset       = horizontal_offset
Ejemplo n.º 5
0
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- input stream ("StringIO" or "file"). Reading starts at the current
             position (presumably right behind the opening '[' -- to be
             confirmed against the caller) and stops behind the closing ']'.

       A leading '^' requests the inverse set. In that case the complement of
       the collected set with respect to 'Setup.buffer_codec.source_set' is
       returned; otherwise the collected set itself.

       Raises RegularExpressionException if
         -- a '-' appears without a preceding character,
         -- the closing ']' is missing,
         -- a '-' is not followed by a character, or is followed by another '-',
         -- a range consists of a single element, e.g. 'a-a'.
       'tracker.consider_interval()' may raise for further malformed ranges.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        """Consume the next character if it is 'letter'; else leave the stream untouched."""
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:
            stream.seek(position)
            return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code = None
    # Checks for " appearing twice. Some users did use constructs such
    # as "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'."
            )
        elif char_code is None:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [None, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The character *behind* the operator must be inspected here.
                # 'char_code' can only be '-' if it was given escaped as '\-',
                # which is a legitimate begin of a range.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

        # Only reachable with 'None' if the backslash handler delivered it.
        if char_code is None: break

    if tracker.negation_f:
        return tracker.match_set.get_complement(Setup.buffer_codec.source_set)
    else:
        return tracker.match_set
Ejemplo n.º 6
0
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- input stream ("StringIO" or "file"). Reading starts at the current
             position (presumably right behind the opening '[' -- to be
             confirmed against the caller) and stops behind the closing ']'.

       A leading '^' requests the inverse set. In that case the complement of
       the collected set with respect to 'Setup.buffer_codec.source_set' is
       returned; otherwise the collected set itself.

       Raises RegularExpressionException if
         -- a '-' appears without a preceding character,
         -- the closing ']' is missing,
         -- a '-' is not followed by a character, or is followed by another '-',
         -- a range consists of a single element, e.g. 'a-a'.
       'tracker.consider_interval()' may raise for further malformed ranges.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        """Consume the next character if it is 'letter'; else leave the stream untouched."""
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:                        stream.seek(position); return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code     = None
    quote_checker = DoubleQuoteChecker() # Checks for " appearing twice. Some users did use
    #                                    # constructs such as "-" and ended up in confusing behavior.
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code is None:
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [None, ord(']')]:
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The character *behind* the operator must be inspected here.
                # 'char_code' can only be '-' if it was given escaped as '\-',
                # which is a legitimate begin of a range.
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

        # Only reachable with 'None' if the backslash handler delivered it.
        if char_code is None: break

    if tracker.negation_f:
        return tracker.match_set.get_complement(Setup.buffer_codec.source_set)
    else:
        return tracker.match_set