Beispiel #1
0
    def consider_interval(self, Begin, End):
        """Adds the code point range given by 'Begin' and 'End' to the match set.

           Begin, End -- integer code points; they are passed on unchanged
                         to 'Interval(Begin, End)'.

           RAISES: RegularExpressionException, if 'Begin' is greater than 'End'.
        """
        if Begin > End:
            # Show the offending range as characters and as numeric code points.
            begin_txt = utf8.map_unicode_to_utf8(Begin)
            end_txt   = utf8.map_unicode_to_utf8(End)
            detail    = "found range '%s-%s' which corresponds to %i-%i as unicode code points." \
                        % (begin_txt, end_txt, Begin, End)
            raise RegularExpressionException(
                "Character range: '-' requires character with 'lower code' to preceed\n" + detail)

        self.match_set.add_interval(Interval(Begin, End))
    def consider_interval(self, Begin, End):
        """Registers the range described by 'Begin' and 'End' in 'self.match_set'.

           Both arguments are integer code points which are handed to
           'Interval(Begin, End)' without modification.

           RAISES: RegularExpressionException, if 'Begin' exceeds 'End'.
        """
        if Begin > End:
            # Report the range both in readable form and as code point numbers.
            readable_begin = utf8.map_unicode_to_utf8(Begin)
            readable_end   = utf8.map_unicode_to_utf8(End)
            second_line    = "found range '%s-%s' which corresponds to %i-%i as unicode code points." \
                             % (readable_begin, readable_end, Begin, End)
            first_line     = "Character range: '-' requires character with 'lower code' to preceed\n"
            raise RegularExpressionException(first_line + second_line)

        self.match_set.add_interval(Interval(Begin, End))
Beispiel #3
0
def pump(LetterList):
    """Returns a printable, comma separated description of 'LetterList'.

    Every code point is shown as its UTF-8 character followed by its
    hexadecimal value in brackets, e.g. 'a(0061)'. An element which is itself
    a list is shown as the concatenation of its members without separator.

    NOTE: 'LetterList' is sorted in place.
    """
    def describe(code_point):
        return map_unicode_to_utf8(code_point) + "(%04X)" % code_point

    LetterList.sort()

    entry_list = []
    for element in LetterList:
        if type(element) is list:
            entry_list.append("".join(describe(member) for member in element))
        else:
            entry_list.append(describe(element))

    return ", ".join(entry_list)
Beispiel #4
0
def pump(LetterList):
    """Builds a human readable string for the code points in 'LetterList'.

    A plain code point becomes '<utf8 character>(<4-digit hex value>)'. A
    nested list becomes the same description for each of its members, glued
    together without separator. The descriptions of the elements are
    separated by ', '.

    NOTE: The given list is sorted in place before it is described.
    """
    def as_text(code):
        return map_unicode_to_utf8(code) + "(%04X)" % code

    LetterList.sort()

    pieces = []
    for item in LetterList:
        if type(item) is list:
            pieces.append("".join(as_text(sub_item) for sub_item in item))
        else:
            pieces.append(as_text(item))

    return ", ".join(pieces)
def test(the_state_machine, string_to_match):
    """Uses the given state machine to match against the 'string_to_match'."""
    print
    print "string = ", string_to_match
    letter_code_list = utf8.map_n_utf8_to_unicode(string_to_match)
    norm_db, x, x    = sm.get_state_index_normalization()
    state_index      = the_state_machine.init_state_index
    letter_n = -1
    for letter_code in letter_code_list:   
        letter_n += 1   
        if letter_n % 5 == 0: sys.stdout.write("\n")
        state_index = sm.states[state_index].target_map.get_resulting_target_state_index(letter_code) 
        sys.stdout.write("'%s' --> (%s), " % (utf8.map_unicode_to_utf8(letter_code), 
                                             repr(norm_db[state_index]).replace("L","")))
        if state_index == -1: break

    print
Beispiel #6
0
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Prints the characters of 'CharSet' one by one as a table on stdout.

    Each output line starts with the code point of its first column and
    covers a fixed number of consecutive code points. Characters which are
    not in the set leave their column blank. A line '...' is printed when
    at least one complete line has been skipped.

    CharSet     -- character set; passed to 'CharacterList(CharSet)'.
    Display     -- "hex":  print hexadecimal code points, 8 columns per line.
                   "utf8": print the characters themselves, 32 columns per
                           line; code points below 0x20 appear as '?'.
    ScreenWidth -- not used by this function.
    """
    assert Display in ["hex", "utf8"]

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth = 6
    else:
        CharactersPerLine = 32
        ColumnWidth = 2

    character_list = CharacterList(CharSet)
    if character_list.is_empty():
        sys.stdout.write("<Result = Empty Character Set>\n")
        return

    # Avoid memory overflow for very large sets: get character by character
    last_start_character_of_line = -1
    last_horizontal_offset = -1
    while True:
        character_code = character_list.next()
        if character_code is None: break

        horizontal_offset = character_code % CharactersPerLine
        start_character_of_line = character_code - horizontal_offset

        if start_character_of_line > last_start_character_of_line + CharactersPerLine:
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # Nothing has been printed on the new line yet. '-1' ensures that
            # the first character is preceded by exactly one blank column for
            # each column before it. With '0' it ended up one column too far
            # to the left, except when it belonged into column 0.
            last_horizontal_offset = -1

        # Leave the columns of the characters blank that are not in the set.
        skipped_column_n = horizontal_offset - last_horizontal_offset - 1
        sys.stdout.write(" " * (ColumnWidth * skipped_column_n))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        else:
            if character_code >= 0x20:
                sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
            else:
                sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset = horizontal_offset
Beispiel #7
0
def __print_set_single_characters(CharSet, Display, ScreenWidth):
    """Prints the characters of 'CharSet' one by one as a table on stdout.

    Each output line starts with the code point of its first column and
    covers a fixed number of consecutive code points. Characters which are
    not in the set leave their column blank. A line '...' is printed when
    at least one complete line has been skipped.

    CharSet     -- character set; passed to 'CharacterList(CharSet)'.
    Display     -- "hex":  print hexadecimal code points, 8 columns per line.
                   "utf8": print the characters themselves, 32 columns per
                           line; code points below 0x20 appear as '?'.
    ScreenWidth -- not used by this function.
    """
    assert Display in ["hex", "utf8"]

    if Display == "hex":
        CharactersPerLine = 8
        ColumnWidth       = 6
    else:
        CharactersPerLine = 32
        ColumnWidth       = 2

    character_list = CharacterList(CharSet)
    if character_list.is_empty():
        sys.stdout.write("<Result = Empty Character Set>\n")
        return

    # Avoid memory overflow for very large sets: get character by character 
    last_start_character_of_line = -1
    last_horizontal_offset       = -1
    while True:
        character_code = character_list.next()
        if character_code is None: break

        horizontal_offset       = character_code % CharactersPerLine
        start_character_of_line = character_code - horizontal_offset

        if start_character_of_line > last_start_character_of_line + CharactersPerLine: 
            sys.stdout.write("\n...")
        if start_character_of_line != last_start_character_of_line:
            sys.stdout.write("\n%05X: " % start_character_of_line)
            # Nothing has been printed on the new line yet. '-1' ensures that
            # the first character is preceded by exactly one blank column for
            # each column before it. With '0' it ended up one column too far
            # to the left, except when it belonged into column 0.
            last_horizontal_offset = -1

        # Leave the columns of the characters blank that are not in the set.
        skipped_column_n = horizontal_offset - last_horizontal_offset - 1
        sys.stdout.write(" " * (ColumnWidth * skipped_column_n))

        if Display == "hex":
            sys.stdout.write("%05X " % character_code)
        else:
            if character_code >= 0x20:
                sys.stdout.write("%s " % map_unicode_to_utf8(character_code))
            else:
                sys.stdout.write("? ")

        last_start_character_of_line = start_character_of_line
        last_horizontal_offset       = horizontal_offset
Beispiel #8
0
def test(the_state_machine, string_to_match):
    """Uses the given state machine to match against the 'string_to_match'."""
    print
    print "string = ", string_to_match
    letter_code_list = utf8.map_n_utf8_to_unicode(string_to_match)
    norm_db, x, x = sm.get_state_index_normalization()
    state_index = the_state_machine.init_state_index
    letter_n = -1
    for letter_code in letter_code_list:
        letter_n += 1
        if letter_n % 5 == 0: sys.stdout.write("\n")
        state_index = sm.states[
            state_index].target_map.get_resulting_target_state_index(
                letter_code)
        sys.stdout.write("'%s' --> (%s), " %
                         (utf8.map_unicode_to_utf8(letter_code),
                          repr(norm_db[state_index]).replace("L", "")))
        if state_index == -1: break

    print
Beispiel #9
0
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- stream of type 'StringIO' or 'file'; presumably positioned right
             behind the opening '['. It is read up to and including the
             closing ']'.

       RETURNS: The set of the mentioned characters. If the expression starts
                with '^' the complement with respect to
                'Setup.buffer_encoding.source_set' is returned.

       RAISES: RegularExpressionException, if
               -- a '-' appears without a preceding character,
               -- the closing ']' is missing,
               -- the '-' of a range is followed by nothing, by ']', or by '-',
               -- a range consists of only one character.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consume 'letter' if it comes next in the stream; else do not move.
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:
            stream.seek(position)
            return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code = None
    # Checks for " appearing twice. Some users did use constructs such as "-"
    # and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'."
            )
        elif char_code is None:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [None, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # It is the character BEHIND the range operator that must not
                # be '-'. The range start may well be an escaped '\-'.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

        # Only reachable with 'None' if the backslash handling delivered 'None'.
        if char_code is None: break

    if tracker.negation_f:
        return tracker.match_set.get_complement(
            Setup.buffer_encoding.source_set)
    else:
        return tracker.match_set
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- stream of type 'StringIO' or 'file'; presumably positioned right
             behind the opening '['. It is read up to and including the
             closing ']'.

       RETURNS: The set of the mentioned characters. If the expression starts
                with '^' the complement with respect to
                'Setup.buffer_codec.source_set' is returned.

       RAISES: RegularExpressionException, if
               -- a '-' appears without a preceding character,
               -- the closing ']' is missing,
               -- the '-' of a range is followed by nothing, by ']', or by '-',
               -- a range consists of only one character.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consume 'letter' if it comes next in the stream; else do not move.
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:                        stream.seek(position); return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code     = None
    quote_checker = DoubleQuoteChecker() # Checks for " appearing twice. Some users did use
    #                                    # constructs such as "-" and ended up in confusing behavior.
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code is None: 
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"): 
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [None, ord(']')]: 
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # It is the character BEHIND the range operator that must not
                # be '-'. The range start may well be an escaped '\-'.
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"): 
                char_code_2 = snap_backslashed_character.do(sh)  

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

        # Only reachable with 'None' if the backslash handling delivered 'None'.
        if char_code is None: break

    if tracker.negation_f: 
        return tracker.match_set.get_complement(Setup.buffer_codec.source_set)
    else:                  
        return tracker.match_set
Beispiel #11
0
#! /usr/bin/env python
import sys
import os
import StringIO
sys.path.insert(0, os.environ["QUEX_PATH"])

import quex.engine.misc.utf8 as utf8

if "--hwut-info" in sys.argv:
    print "UTF8: Map UTF8 String To Unicode Values"
    sys.exit(0)

# the unequal sign from the utf-8 manpage
txt = chr(0xE2) + chr(0x89) + chr(0xA0)
# unicode character (C) from the utf-8 man-page
txt += chr(0xC2) + chr(0xA9)
# unicode character 'Lam' in the arabic code page
txt += chr(0xd9) + chr(0x84)
cstr = StringIO.StringIO(txt)

print "unequal-sign:   E2.89.A0 -> %x" % utf8.map_utf8_to_unicode(cstr)
print "copyright-sign: C2.A9    -> %x" % utf8.map_utf8_to_unicode(cstr)
print "arabic lam:     D9.84    -> %x" % utf8.map_utf8_to_unicode(cstr)
# print map(lambda x: "%x" % x, utf8.read(cstr,10000))

print "%x -> %s" % (0x2260, utf8.map_unicode_to_utf8(0x2260))
print "%x -> %s" % (0xA9, utf8.map_unicode_to_utf8(0xA9))
print "%x -> %s" % (0x644, utf8.map_unicode_to_utf8(0x644))
#! /usr/bin/env python
import sys
import os
import StringIO
sys.path.insert(0, os.environ["QUEX_PATH"])

import quex.engine.misc.utf8            as utf8

if "--hwut-info" in sys.argv:
    print "UTF8: Map UTF8 String To Unicode Values"
    sys.exit(0)
    
# the unequal sign from the utf-8 manpage
txt = chr(0xE2) + chr(0x89) + chr(0xA0)
# unicode character (C) from the utf-8 man-page
txt += chr(0xC2) + chr(0xA9)
# unicode character 'Lam' in the arabic code page
txt += chr(0xd9) + chr(0x84) 
cstr = StringIO.StringIO(txt)

print "unequal-sign:   E2.89.A0 -> %x" % utf8.map_utf8_to_unicode(cstr)
print "copyright-sign: C2.A9    -> %x" % utf8.map_utf8_to_unicode(cstr)
print "arabic lam:     D9.84    -> %x" % utf8.map_utf8_to_unicode(cstr)
# print map(lambda x: "%x" % x, utf8.read(cstr,10000))

print "%x -> %s" % (0x2260, utf8.map_unicode_to_utf8(0x2260))
print "%x -> %s" % (0xA9,   utf8.map_unicode_to_utf8(0xA9))
print "%x -> %s" % (0x644,  utf8.map_unicode_to_utf8(0x644))


Beispiel #13
0
        byte1 = value >> 8
        assert value & 0xFFFF0000 == 0, "Value = %08X" % value
        result += chr(byte0) + chr(byte1)
    return result

# (1) Critical Borders for UTF8: 
#     0x80, 0x800, 0x10000, 0x10ffff (last code element)
# (2) Critical Borders for UTF16: 
#     0xD800, 0xE000
unicode_character_list =   range(0x01, 0x7F)            \
                         + range(0x800, 0x800 +10)      \
                         + range(0x10000, 0x10000 + 10) 

# Encode each code point in all three encodings; collect the pieces first and
# concatenate them once at the end.
utf8_chunks    = []
utf16_chunks   = []
unicode_chunks = []
for char in unicode_character_list:
    utf8_chunks.append(utf8.map_unicode_to_utf8(char))
    utf16_chunks.append(word_4_split(utf16.unicode_to_utf16(char)))
    unicode_chunks.append(byte_4_split(char))

utf8_list    = "".join(utf8_chunks)
utf16_list   = "".join(utf16_chunks)
unicode_list = "".join(unicode_chunks)

# One example file for each encoding, written in binary mode.
for file_name, content in (("example/utf8.txt",    utf8_list),
                           ("example/utf16le.txt", "\xff\xfe" + utf16_list), # BOM: Little Endian
                           ("example/ucs4le.txt",  unicode_list)):
    fh = open(file_name, "wb")
    fh.write(content)
    fh.close()