def _split_by_transformed_sequence_length(X):
    """Split Unicode interval into intervals where all values have the same
    utf8-byte sequence length.

    RETURNS: map: sequence length --> Unicode Sub-Interval of X.
    """
    # Clip the interval to the representable Unicode range.
    if X.begin < 0:      X.begin = 0
    if X.end > UTF8_MAX: X.end   = UTF8_MAX + 1

    if X.size() == 0: return None

    result      = {}
    begin       = X.begin
    # Byte sequence length of the last value inside the interval.
    last_length = len(unicode_to_utf8(X.end - 1))
    while True:
        # Byte sequence length of the first not-yet-covered value.
        length = len(unicode_to_utf8(begin))
        if length == last_length:
            # Remainder of the interval encodes with a single length.
            result[length] = Interval(begin, X.end)
            break
        # All values below the border encode with 'length' bytes.
        border         = UTF8_BORDERS[length - 1]
        result[length] = Interval(begin, border)
        begin          = border
    return result
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """Build the byte-interval trigger sequence for a 'contigous' unicode
    interval 'X' whose utf8 sequences all have length 'L': byte 'i' of the
    result spans from byte 'i' of the first value's sequence to byte 'i'
    of the last value's sequence.
    """
    first = unicode_to_utf8(X.begin)
    last  = unicode_to_utf8(X.end - 1)

    # If the interval is contigous it must produce equal length utf8 sequences
    result = []
    for i in range(L):
        result.append(Interval(first[i], last[i] + 1))
    return result
def homogeneous_chunk_n_per_character(CharacterSet):
    """If all characters in a unicode character set state machine require
    the same number of bytes to be represented this number is returned.
    Otherwise, 'None' is returned.

    RETURNS:   N > 0  number of bytes required to represent any character in the 
                      given state machine.
               None   characters in the state machine require different numbers of
                      bytes.
    """
    assert isinstance(CharacterSet, NumberSet)

    intervals = CharacterSet.get_intervals(PromiseToTreatWellF=True)
    smallest  = intervals[0].begin     # First element of number set
    largest   = intervals[-1].end - 1  # Last element of number set

    # The number of bytes per character increases monotonously with the
    # code point, so only the borders of the set need to be considered.
    chunk_n_smallest = len(unicode_to_utf8(smallest))
    chunk_n_largest  = len(unicode_to_utf8(largest))

    if chunk_n_smallest == chunk_n_largest: return chunk_n_smallest
    else:                                   return None
def lexatom_n_per_character(self, CharacterSet):
    """If all characters in a unicode character set state machine require
    the same number of bytes to be represented this number is returned.
    Otherwise, 'None' is returned.

    RETURNS:   N > 0  number of bytes required to represent any character in the 
                      given state machine.
               None   characters in the state machine require different numbers of
                      bytes.
    """
    assert isinstance(CharacterSet, NumberSet)

    interval_list = CharacterSet.get_intervals(PromiseToTreatWellF=True)

    # Utf8 sequence length grows monotonously with the code point; it
    # suffices to compare the borders of the number set: its first
    # element 'interval_list[0].begin' and its last 'interval_list[-1].end - 1'.
    n_front = len(unicode_to_utf8(interval_list[0].begin))
    n_back  = len(unicode_to_utf8(interval_list[-1].end - 1))

    if n_front != n_back: return None
    return n_front
def test(UC): global error_n correct = reference_utf8_encoder(UC) output = unicode_to_utf8(UC) if correct != output: print "ERROR: unicode_to_utf8 with %06X" % UC print correct print output error_n += 1 backward = utf8_to_unicode(correct) if backward != UC: print "ERROR: utf8_to_unicode with %06X" % UC error_n += 1
def test(UC): global error_n correct = reference_utf8_encoder(UC) output = unicode_to_utf8(UC) if correct != output: print "ERROR: unicode_to_utf8 with %06X" % UC print correct print output error_n += 1 backward = utf8_to_unicode(correct) if backward != UC: print "ERROR: utf8_to_unicode with %06X" % UC error_n += 1
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """For a 'contigous' interval 'X' (all utf8 sequences of length 'L'),
    return per byte position the Interval of byte values covered, spanning
    from the sequence of the interval's first value to that of its last.
    """
    lo_sequence = unicode_to_utf8(X.begin)
    hi_sequence = unicode_to_utf8(X.end - 1)

    # If the interval is contigous it must produce equal length utf8 sequences
    return [ Interval(lo_sequence[i], hi_sequence[i] + 1)
             for i in range(L) ]
def split_interval_into_contigous_byte_sequence_range(X, L):
    """Use the fact that utf8 byte sequences of increasing unicode values
    relate to increasing byte sequence values. Consider the unicode interval
    [0x12345, 0x17653]:

            Unicode   UTF8-byte sequence
            012345    F0.92.8D.85
            ...
            01237F    F0.92.8D.BF
            012380    F0.92.8E.80
            ...
            012FFF    F0.92.BF.BF
            013000    F0.93.80.80
            ...
            016FFF    F0.96.BF.BF
            017000    F0.97.80.80
            ...
            01763F    F0.97.98.BF
            017640    F0.97.99.80
            ...
            017653    F0.97.99.93

    The utf8 sequences of the values in the sub-interval [0x12345, 0x1237F]
    only differ with respect to the last byte, but they all trigger to the
    'original target state', so they can be combined into a trigger sequence

                          [F0, 92, 8D, [85,BF]]

    Analogously, the values in [0x12FFF, 0x13000] differ only with respect
    to the last two bytes. But, all trigger with 2x [80, BF] to the original
    target state, so they can be combined to

                          [F0, 92, [80,BF], [80,BF]]

    A contigous interval is an interval where such combinations are valid.
    This function splits a given interval into such intervals.

    REQUIRES: The byte sequences in the given interval **must** all have
              the same length 'L'.

    RETURNS:  List of 'contigous' intervals and the index of the first byte
              where all sequences differ.
    """
    # A byte in a utf8 sequence can only have a certain range depending
    # on its position. UTF8 sequences look like the following dependent
    # on their length:
    #
    #       Length:   Byte Masks for each byte
    #
    #       1 byte    0xxxxxxx
    #       2 bytes   110xxxxx 10xxxxxx
    #       3 bytes   1110xxxx 10xxxxxx 10xxxxxx
    #       4 bytes   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    #       5 bytes   ...
    #
    # where 'free' bits are indicated by 'x'.
    # Min. value of a byte = where all 'x' are zero.
    # Max. value of a byte = where all 'x' are 1.
    #
    def min_byte_value(ByteIndex):
        """Smallest possible value of byte 'ByteIndex' in a sequence of length L."""
        assert L <= 6
        if ByteIndex != 0: return 0x80  # Only first byte's range depends on length
        # First byte of a 1, 2, ... 6 byte sequence (keyed by length L).
        # (Fixed off-by-one: table was keyed 0..5 but indexed with L = 1..6.)
        return { 1: 0x00, 2: 0xC0, 3: 0xE0, 4: 0xF0, 5: 0xF8, 6: 0xFC }[L]

    def max_byte_value(ByteIndex):
        """Largest possible value of byte 'ByteIndex' in a sequence of length L."""
        assert L <= 6
        if ByteIndex != 0: return 0xBF  # Only first byte's range depends on length
        # First byte of a 1, 2, ... 6 byte sequence (keyed by length L).
        # (Fixed off-by-one: table was keyed 0..5 but indexed with L = 1..6.)
        return { 1: 0x7F, 2: 0xDF, 3: 0xEF, 4: 0xF7, 5: 0xFB, 6: 0xFD }[L]

    def find_first_diff_byte(front_sequence, back_sequence):
        # Find the first byte that is different in the front and back sequence
        for i in range(L - 1):
            if front_sequence[i] != back_sequence[i]: return i
        # At least the last byte must be different. That's why it **must** be the
        # one different if no previous byte was it.
        return L - 1

    assert X.size() != 0
    if X.size() == 1: return [ X ], 0
    # If the utf8 sequence consist of one byte, then the range cannot be split.
    if L == 1:        return [ X ], 0

    front_sequence = unicode_to_utf8(X.begin)
    back_sequence  = unicode_to_utf8(X.end - 1)
    p              = find_first_diff_byte(front_sequence, back_sequence)

    result        = []
    current_begin = X.begin
    byte_sequence = copy(front_sequence)
    # Walk backwards over the bytes behind 'p': each step extends the current
    # byte to its maximum, closing off one contigous interval.
    # There **must** be at least one overrun, even for 'q=p+1', since 'p+1'
    # indexes the first byte after the first byte that was different. If 'p'
    # indexed that last byte this block is never entered.
    for q in reversed(range(p + 1, L)):
        byte_sequence[q] = max_byte_value(q)
        current_end      = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin    = current_end

    # Cover the span where byte 'p' ranges over the values strictly between
    # 'front_sequence[p]' and 'back_sequence[p]' (all later bytes span fully).
    if front_sequence[p] + 1 != back_sequence[p]:
        if p == L - 1: byte_sequence[p] = back_sequence[p]
        else:          byte_sequence[p] = back_sequence[p] - 1
        current_end   = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin = current_end

    # Approach the back sequence byte by byte from the front.
    byte_sequence[p] = back_sequence[p]
    for q in range(p + 1, L):
        if back_sequence[q] == min_byte_value(q):
            byte_sequence[q] = back_sequence[q]
        else:
            if q == L - 1: byte_sequence[q] = back_sequence[q]
            else:          byte_sequence[q] = back_sequence[q] - 1
            current_end   = utf8_to_unicode(byte_sequence) + 1
            result.append(Interval(current_begin, current_end))
            if current_begin == X.end: break
            current_begin    = current_end
            byte_sequence[q] = back_sequence[q]

    # Close the last gap up to the interval's end, if any remains.
    if current_begin != X.end:
        result.append(Interval(current_begin, X.end))

    return result, p
print "CHOICES: error-detect, plain;" sys.exit() if "error-detect" in sys.argv: Setup.bad_lexatom_detection_f = True else: Setup.bad_lexatom_detection_f = False boarders = [ 0x00000080, 0x000007FF, 0x00000800, 0x0000FFFF, 0x00010000, 0x001FFFFF, 0x00200000, 0x03FFFFFF, 0x04000000, 0x7FFFFFFF ] good_sequences = [ unicode_to_utf8(x) for x in boarders ] # Boarders of code unit ragnes which are encoding errors: bad_byte0s = [ 0x80, 0xBF, 0xFE, 0xFF ] # boarders of disallowed Byte[0] bad_byteNs = [ 0x00, 0x7F, 0xC0, 0xFF ] # boarders of disallowed Byte[>0] sm = helper.generate_sm_for_boarders(boarders, EncodingTrafoUTF8()) bad_sequence_list = helper.get_bad_sequences(good_sequences, bad_byte0s, bad_byteNs) if True: helper.test_good_and_bad_sequences(sm, good_sequences, bad_sequence_list) else: # Check on isolated sequence (debugging)
def _get_contiguous_interval_sequences(X, L):
    """ A contiguous interval in the domain is an interval where all 'N'
    first lexatoms in the range the same. The last 'N-2' lexatoms cover the
    whole range of lexatom values. The lexatoms at 'N-2' are all adjacent.
    'N' may range from 1 to 'max. length of lexatom + 1'. In the case 
    'N=max. length of lexatom + 1' only the last by covers a range (if it 
    does).

    EXAMPLE: UTF8 sequences related to the unicode interval [0x12345, 0x17653].

       Sequence Description in     
       Unicode:   UTF8-byte sequence:   Contiguous Interval:

       012345     F0.92.8D.85 ----.
       ...                         |=>  F0.92.8D.[85-BF]
       01237F     F0.92.8D.BF ----'
       012380     F0.92.8E.80 ----.
       ...                         |=>  F0.92.[8E-BF].[80-BF]
       012FFF     F0.92.BF.BF ----'
       013000     F0.93.80.80 ----.
       ...                         |=>  F0.[93-96].[8E-BF].[80-BF]
       016FFF     F0.96.BF.BF ----'
       017000     F0.97.80.80 ----.
       ...                         |=>  F0.97.[80-98].[80-BF]
       01763F     F0.97.98.BF ----'
       017640     F0.97.99.80 ----.
       ...                         |=>  F0.97.99.[80-93]
       017653     F0.97.99.93 ----'

    This function splits a given interval into such intervals. Providing
    such intervals and implementing it in state sequences avoids a complex
    hopcroft minimization after the transformation.

    REQUIRES: The byte sequence in the given interval **must** have all the
              same length L.

    RETURNS: List of 'contigous' intervals and the index of the first byte
             where all sequences differ.
    """
    # A byte in a utf8 sequence can only have a certain range depending
    # on its position. UTF8 sequences look like the following dependent
    # on their length:
    #
    #       Length:   Byte Masks for each byte
    #
    #       1 byte    0xxxxxxx
    #       2 bytes   110xxxxx 10xxxxxx
    #       3 bytes   1110xxxx 10xxxxxx 10xxxxxx
    #       4 bytes   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    #       5 bytes   ...
    #
    # where 'free' bits are indicated by 'x'.
    # Min. value of a byte = where all 'x' are zero.
    # Max. value of a byte = where all 'x' are 1.
    #
    def min_byte_value(L, ByteIndex):
        """Smallest possible value of byte 'ByteIndex' in a sequence of length L."""
        assert L <= 6
        if ByteIndex != 0: return 0x80  # Only first byte's range depends on length
        # First byte of a 1, 2, ... 6 byte sequence (keyed by length L).
        # (Fixed off-by-one: table was keyed 0..5 but indexed with L = 1..6.)
        return { 1: 0x00, 2: 0xC0, 3: 0xE0, 4: 0xF0, 5: 0xF8, 6: 0xFC }[L]

    def max_byte_value(L, ByteIndex):
        """Largest possible value of byte 'ByteIndex' in a sequence of length L."""
        assert L <= 6
        if ByteIndex != 0: return 0xBF  # Only first byte's range depends on length
        # First byte of a 1, 2, ... 6 byte sequence (keyed by length L).
        # (Fixed off-by-one: table was keyed 0..5 but indexed with L = 1..6.)
        return { 1: 0x7F, 2: 0xDF, 3: 0xEF, 4: 0xF7, 5: 0xFB, 6: 0xFD }[L]

    def find_first_diff_byte(front_sequence, back_sequence):
        # Find the first byte that is different in the front and back sequence
        for i in range(L - 1):
            if front_sequence[i] != back_sequence[i]: return i
        # At least the last byte must be different. That's why it **must** be the
        # one different if no previous byte was it.
        return L - 1

    assert X.size() != 0

    # Interval's size = 1 character --> no split
    if X.size() == 1:  return [ X ]
    # Resulting utf8 sequence length = 1 --> no split
    elif L == 1:       return [ X ]

    # Utf8 Sequences representing first and last element in interval 'X'.
    front_sequence = unicode_to_utf8(X.begin)
    back_sequence  = unicode_to_utf8(X.end - 1)
    p              = find_first_diff_byte(front_sequence, back_sequence)

    result        = []
    current_begin = X.begin
    byte_sequence = copy(front_sequence)
    # Walk backwards over the bytes behind 'p': each step extends the current
    # byte to its maximum, closing off one contiguous interval.
    # There **must** be at least one overrun, even for 'q=p+1', since 'p+1'
    # indexes the first byte after the first byte that was different. If 'p'
    # indexed that last byte this block is never entered.
    for q in reversed(range(p + 1, L)):
        byte_sequence[q] = max_byte_value(L, q)
        current_end      = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin    = current_end

    # Cover the span where byte 'p' ranges over the values strictly between
    # 'front_sequence[p]' and 'back_sequence[p]' (all later bytes span fully).
    if front_sequence[p] + 1 != back_sequence[p]:
        if p == L - 1: byte_sequence[p] = back_sequence[p]
        else:          byte_sequence[p] = back_sequence[p] - 1
        current_end   = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin = current_end

    # Approach the back sequence byte by byte from the front.
    byte_sequence[p] = back_sequence[p]
    for q in range(p + 1, L):
        if back_sequence[q] == min_byte_value(L, q):
            byte_sequence[q] = back_sequence[q]
        else:
            if q == L - 1: byte_sequence[q] = back_sequence[q]
            else:          byte_sequence[q] = back_sequence[q] - 1
            current_end   = utf8_to_unicode(byte_sequence) + 1
            result.append(Interval(current_begin, current_end))
            if current_begin == X.end: break
            current_begin    = current_end
            byte_sequence[q] = back_sequence[q]

    # Close the last gap up to the interval's end, if any remains.
    if current_begin != X.end:
        result.append(Interval(current_begin, X.end))

    return result
if "--hwut-info" in sys.argv: print "UTF8 Split: Repetition at Codec Boarders" print "CHOICES: error-detect, plain;" sys.exit() if "error-detect" in sys.argv: Setup.bad_lexatom_detection_f = True else: Setup.bad_lexatom_detection_f = False boarders = [ 0x00000080, 0x000007FF, 0x00000800, 0x0000FFFF, 0x00010000, 0x001FFFFF, 0x00200000, 0x03FFFFFF, 0x04000000, 0x7FFFFFFF ] good_sequences = [unicode_to_utf8(x) for x in boarders] # Boarders of code unit ragnes which are encoding errors: bad_byte0s = [0x80, 0xBF, 0xFE, 0xFF] # boarders of disallowed Byte[0] bad_byteNs = [0x00, 0x7F, 0xC0, 0xFF] # boarders of disallowed Byte[>0] sm = helper.generate_sm_for_boarders(boarders, EncodingTrafoUTF8()) bad_sequence_list = helper.get_bad_sequences(good_sequences, bad_byte0s, bad_byteNs) if True: helper.test_good_and_bad_sequences(sm, good_sequences, bad_sequence_list) else: # Check on isolated sequence (debugging)
def split_interval_into_contigous_byte_sequence_range(X, L):
    """Use the fact that utf8 byte sequences of increasing unicode values
    relate to increasing byte sequence values. Consider the unicode interval
    [0x12345, 0x17653]:

            Unicode   UTF8-byte sequence
            012345    F0.92.8D.85
            ...
            01237F    F0.92.8D.BF
            012380    F0.92.8E.80
            ...
            012FFF    F0.92.BF.BF
            013000    F0.93.80.80
            ...
            016FFF    F0.96.BF.BF
            017000    F0.97.80.80
            ...
            01763F    F0.97.98.BF
            017640    F0.97.99.80
            ...
            017653    F0.97.99.93

    The utf8 sequences of the values in the sub-interval [0x12345, 0x1237F]
    only differ with respect to the last byte, but they all trigger to the
    'original target state', so they can be combined into a trigger sequence

                          [F0, 92, 8D, [85,BF]]

    Analogously, the values in [0x12FFF, 0x13000] differ only with respect
    to the last two bytes. But, all trigger with 2x [80, BF] to the original
    target state, so they can be combined to

                          [F0, 92, [80,BF], [80,BF]]

    A contigous interval is an interval where such combinations are valid.
    This function splits a given interval into such intervals.

    REQUIRES: The byte sequences in the given interval **must** all have
              the same length 'L'.

    RETURNS:  List of 'contigous' intervals and the index of the first byte
              where all sequences differ.
    """
    # A byte in a utf8 sequence can only have a certain range depending
    # on its position. UTF8 sequences look like the following dependent
    # on their length:
    #
    #       Length:   Byte Masks for each byte
    #
    #       1 byte    0xxxxxxx
    #       2 bytes   110xxxxx 10xxxxxx
    #       3 bytes   1110xxxx 10xxxxxx 10xxxxxx
    #       4 bytes   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    #       5 bytes   ...
    #
    # where 'free' bits are indicated by 'x'.
    # Min. value of a byte = where all 'x' are zero.
    # Max. value of a byte = where all 'x' are 1.
    #
    def min_byte_value(ByteIndex):
        """Smallest possible value of byte 'ByteIndex' in a sequence of length L."""
        assert L <= 6
        if ByteIndex != 0: return 0x80  # Only first byte's range depends on length
        # First byte of a 1, 2, ... 6 byte sequence (keyed by length L).
        # (Fixed off-by-one: table was keyed 0..5 but indexed with L = 1..6.)
        return { 1: 0x00, 2: 0xC0, 3: 0xE0, 4: 0xF0, 5: 0xF8, 6: 0xFC }[L]

    def max_byte_value(ByteIndex):
        """Largest possible value of byte 'ByteIndex' in a sequence of length L."""
        assert L <= 6
        if ByteIndex != 0: return 0xBF  # Only first byte's range depends on length
        # First byte of a 1, 2, ... 6 byte sequence (keyed by length L).
        # (Fixed off-by-one: table was keyed 0..5 but indexed with L = 1..6.)
        return { 1: 0x7F, 2: 0xDF, 3: 0xEF, 4: 0xF7, 5: 0xFB, 6: 0xFD }[L]

    def find_first_diff_byte(front_sequence, back_sequence):
        # Find the first byte that is different in the front and back sequence
        for i in range(L - 1):
            if front_sequence[i] != back_sequence[i]: return i
        # At least the last byte must be different. That's why it **must** be the
        # one different if no previous byte was it.
        return L - 1

    assert X.size() != 0
    if X.size() == 1: return [ X ], 0
    # If the utf8 sequence consist of one byte, then the range cannot be split.
    if L == 1:        return [ X ], 0

    front_sequence = unicode_to_utf8(X.begin)
    back_sequence  = unicode_to_utf8(X.end - 1)
    p              = find_first_diff_byte(front_sequence, back_sequence)

    result        = []
    current_begin = X.begin
    byte_sequence = copy(front_sequence)
    # Walk backwards over the bytes behind 'p': each step extends the current
    # byte to its maximum, closing off one contigous interval.
    # There **must** be at least one overrun, even for 'q=p+1', since 'p+1'
    # indexes the first byte after the first byte that was different. If 'p'
    # indexed that last byte this block is never entered.
    for q in reversed(range(p + 1, L)):
        byte_sequence[q] = max_byte_value(q)
        current_end      = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin    = current_end

    # Cover the span where byte 'p' ranges over the values strictly between
    # 'front_sequence[p]' and 'back_sequence[p]' (all later bytes span fully).
    if front_sequence[p] + 1 != back_sequence[p]:
        if p == L - 1: byte_sequence[p] = back_sequence[p]
        else:          byte_sequence[p] = back_sequence[p] - 1
        current_end   = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin = current_end

    # Approach the back sequence byte by byte from the front.
    byte_sequence[p] = back_sequence[p]
    for q in range(p + 1, L):
        if back_sequence[q] == min_byte_value(q):
            byte_sequence[q] = back_sequence[q]
        else:
            if q == L - 1: byte_sequence[q] = back_sequence[q]
            else:          byte_sequence[q] = back_sequence[q] - 1
            current_end   = utf8_to_unicode(byte_sequence) + 1
            result.append(Interval(current_begin, current_end))
            if current_begin == X.end: break
            current_begin    = current_end
            byte_sequence[q] = back_sequence[q]

    # Close the last gap up to the interval's end, if any remains.
    if current_begin != X.end:
        result.append(Interval(current_begin, X.end))

    return result, p