Example #1
0
def _split_contigous_intervals_for_surrogates(Begin, End):
    """Partition the code point interval [Begin, End) into sub-intervals
       such that no sub-interval crosses a border of the trailing UTF-16
       surrogate's value range. Inside each returned sub-interval the
       leading code unit is either constant, or the trailing code unit
       sweeps its complete admissible range.
    """
    global ForbiddenRange
    assert 0x10000 <= Begin
    assert End <= 0x110000
    assert Begin < End

    lead_first = unicode_to_utf16(Begin)[0]
    lead_last  = unicode_to_utf16(End - 1)[0]

    # Same leading surrogate for first and last character:
    # the whole interval maps onto one contiguous trailing-surrogate
    # range, so no split is required.
    if lead_first == lead_last:
        return [Interval(Begin, End)]

    # Leading surrogates differ => up to three parts:
    #
    # (1) From Begin until the trailing surrogate of 'lead_first'
    #     reaches its upper bound.
    cut = utf16_to_unicode([lead_first, ForbiddenRange.end - 1]) + 1
    # Must hold, since lead_first != lead_last (entry condition).
    assert End > cut
    pieces = [Interval(Begin, cut)]

    # (2) Full sweeps of the trailing surrogate for every leading
    #     surrogate strictly between 'lead_first' and 'lead_last'.
    if lead_last != lead_first + 1:
        mid_cut = utf16_to_unicode([lead_last - 1, ForbiddenRange.end - 1]) + 1
        # Must hold, since at least one full sweep lies in between.
        assert mid_cut > cut
        pieces.append(Interval(cut, mid_cut))
        cut = mid_cut

    # (3) Remainder under the leading surrogate 'lead_last' up to End.
    if End > cut:
        pieces.append(Interval(cut, End))

    return pieces
Example #2
0
def _split_contigous_intervals_for_surrogates(Begin, End):
    """Split [Begin, End) so that no resulting interval runs across a
       'surrogate' border of the trailing UTF-16 code unit. The interval
       is checked against the surrogate domain of its first character;
       whenever the end lies in a later domain, a piece up to the domain
       border is cut off, until start and end share a domain.
    """
    global ForbiddenRange
    assert Begin >= 0x10000
    assert End <= 0x110000
    assert End > Begin

    front_lead = unicode_to_utf16(Begin)[0]
    back_lead  = unicode_to_utf16(End - 1)[0]

    # First code unit identical => either a single two-word character,
    # or a range that stays inside one trailing-surrogate domain. Either
    # way the interval is already contiguous.
    if front_lead == back_lead:
        return [Interval(Begin, End)]

    trail_max = ForbiddenRange.end - 1

    # Distinct first code units => separate into three domains:
    #   (1) Begin up to the border of the first trailing-surrogate domain.
    #   (2) Complete iterations of the trailing surrogate while the
    #       leading surrogate increases.
    #   (3) The final trailing-surrogate domain up to End.
    border = utf16_to_unicode([front_lead, trail_max]) + 1
    # Guaranteed by front_lead != back_lead.
    assert border < End
    sub_intervals = [Interval(Begin, border)]

    if back_lead - front_lead > 1:
        middle_border = utf16_to_unicode([back_lead - 1, trail_max]) + 1
        # Guaranteed, since whole domains lie between front and back.
        assert middle_border > border
        sub_intervals.append(Interval(border, middle_border))
        border = middle_border

    if border < End:
        sub_intervals.append(Interval(border, End))

    return sub_intervals
Example #3
0
def _get_trigger_sequence_for_interval(X):
    """Map code point interval X to a sequence of code-unit intervals.

    X must lie entirely below 0x10000 or entirely at/above it. An
    interval below 0x10000 is returned unchanged as a one-element
    sequence; otherwise each character takes two code units and one
    interval per code-unit position is produced.
    """
    # The interval must not straddle the 0x10000 border.
    assert X.end < 0x10000 or X.begin >= 0x10000

    if X.end < 0x10000:
        return [X]

    # Two-word characters: split into one interval per code unit.
    lo_seq = unicode_to_utf16(X.begin)
    hi_seq = unicode_to_utf16(X.end - 1)
    return [Interval(lo, hi + 1) for lo, hi in zip(lo_seq, hi_seq)]
Example #4
0
    def lexatom_n_per_character(self, CharacterSet):
        """Uniform code-unit count for all characters of 'CharacterSet'.

        RETURNS: N > 0  -- every character of the set is represented by
                           exactly N code units.
                 None   -- the characters require differing counts.
        """
        assert isinstance(CharacterSet, NumberSet)

        intervals = CharacterSet.get_intervals(PromiseToTreatWellF=True)
        lowest  = intervals[0].begin      # smallest element of the set
        highest = intervals[-1].end - 1   # largest element of the set
        # The number of code units per character grows monotonously with
        # the code point, so comparing the two borders suffices.
        n_low  = len(unicode_to_utf16(lowest))
        n_high = len(unicode_to_utf16(highest))
        if n_low == n_high: return n_low
        return None
Example #5
0
    def lexatom_n_per_character(self, CharacterSet):
        """Determine whether all characters of 'CharacterSet' occupy the
        same number of code units.

        RETURNS: N > 0  if that uniform number is N;
                 None   if characters differ in code-unit count.
        """
        assert isinstance(CharacterSet, NumberSet)

        interval_list = CharacterSet.get_intervals(PromiseToTreatWellF=True)
        first_cp = interval_list[0].begin      # first element of number set
        last_cp  = interval_list[-1].end - 1   # last element of number set
        # Encoding length is monotonous in the code point value, hence
        # only the two borders need to be inspected.
        length_pair = (len(unicode_to_utf16(first_cp)),
                       len(unicode_to_utf16(last_cp)))
        if length_pair[0] != length_pair[1]:
            return None
        return length_pair[0]
def test(UC):
    """Check 'unicode_to_utf16()' against the reference encoder for the
    code point 'UC', and verify that 'utf16_to_unicode()' inverts the
    reference sequence. Each detected mismatch is printed and counted
    in the global 'error_n'.
    """
    global error_n
    expected = reference_utf16_encoder(UC)
    produced = unicode_to_utf16(UC)

    if expected != produced:
        print "ERROR: unicode_to_utf16 with %06X" % UC
        print expected
        print produced
        print expected[0] - produced[0]
        error_n += 1

    # Decode the reference sequence; it must reproduce the code point.
    decoded = utf16_to_unicode(expected)
    if decoded != UC:
        print "ERROR: utf16_to_unicode with %06X" % UC
        error_n += 1
Example #7
0
    print "CHOICES:    error-detect, plain;"
    sys.exit()

# Enable detection of lexatoms that are encoding errors, on request.
if "error-detect" in sys.argv:
    Setup.bad_lexatom_detection_f = True
else:
    Setup.bad_lexatom_detection_f = False

# UTF-16 encoding layout:
#   0x000000 - 0x00D7FF: 1 code unit = 2 byte = original UCS code
#   0x00E000 - 0x00FFFF: (same)
#   0x010000 - 0x110000: 2 code units = 4 byte = constructed from UCS code
#                        Range of 1st code unit: 0xD800..0xDBFF
#                                 2nd code unit: 0xDC00..0xDFFF
boarders = [0x000080, 0x00D7FF, 0x00E000, 0x00FFFF, 0x010000, 0x10FFFF]

# Valid code-unit sequences at the encoding borders above.
# (Computed once; the original file computed this list twice.)
good_sequences = [unicode_to_utf16(x) for x in boarders]

# Borders of code unit ranges which are encoding errors:
bad_1st_s = [0xDC00, 0xDFFF]  # borders of disallowed CodeUnit[0]
bad_2nd_s = [0x0000, 0xDBFF, 0xE000,
             0x110000]  # borders of disallowed CodeUnit[1]

trafo = EncodingTrafoUTF16()
trafo.adapt_source_and_drain_range(LexatomByteN=4)
# NOTE(review): 'trafo' is adapted above, yet a fresh EncodingTrafoUTF16()
# is passed below -- confirm whether the adapted instance was intended.
sm = helper.generate_sm_for_boarders(boarders, EncodingTrafoUTF16())

bad_sequence_list = helper.get_bad_sequences(good_sequences, bad_1st_s,
                                             bad_2nd_s)
Example #8
0
        byte1 = value >> 8
        assert value & 0xFFFF0000 == 0, "Value = %08X" % value
        result += chr(byte0) + chr(byte1)
    return result

# Generate example input files containing the same character sequence in
# three encodings (UTF-8, UTF-16-LE with BOM, UCS-4-LE). Python 2 byte
# strings are accumulated and written in binary mode.
#
# (1) Critical Borders for UTF8: 
#     0x80, 0x800, 0x10000, 0x10ffff (last code element)
# (2) Critical Borders for UTF16: 
#     0xD800, 0xE000
# Sample code points around the borders; each range contributes
# characters whose encoded lengths differ.
unicode_character_list =   range(0x01, 0x7F)            \
                         + range(0x800, 0x800 +10)      \
                         + range(0x10000, 0x10000 + 10) 

utf8_list    = ""
utf16_list   = ""
unicode_list = ""
for char in unicode_character_list:
    # word_4_split/byte_4_split presumably serialize code units to
    # little-endian bytes -- defined earlier in this file.
    utf8_list    += utf8.map_unicode_to_utf8(char)
    utf16_list   += word_4_split(utf16.unicode_to_utf16(char))
    unicode_list += byte_4_split(char)

# Write the three encoded streams as binary files.
fh = open("example/utf8.txt", "wb")
fh.write(utf8_list)
fh.close()
fh = open("example/utf16le.txt", "wb")
fh.write("\xff\xfe" + utf16_list) # BOM: Little Endian
fh.close()
fh = open("example/ucs4le.txt", "wb")
fh.write(unicode_list)
fh.close()