def _split_contigous_intervals_for_surrogates(Begin, End):
    """Cut the code point range [Begin, End) at first-code-unit borders.

    Every returned sub-interval is chosen so that it does not start in the
    middle of one first code unit's domain and end in the middle of
    another's.

    Begin, End -- half-open code point range. The asserts below require
                  0x10000 <= Begin < End <= 0x110000, i.e. only code
                  points that 'unicode_to_utf16' encodes as two code
                  units are expected here.

    RETURNS: list of 'Interval' objects in ascending order which together
             cover [Begin, End):

             -- one interval, if 'Begin' and 'End - 1' share the same
                first code unit;
             -- otherwise up to three intervals:
                (1) 'Begin' up to the end of the first code unit's domain,
                (2) the domains of all first code units strictly between
                    the first and the last one (only if there are any),
                (3) start of the last first code unit's domain up to 'End'.
    """
    assert Begin >= 0x10000
    assert End <= 0x110000
    assert End > Begin

    first_units = unicode_to_utf16(Begin)
    last_units = unicode_to_utf16(End - 1)
    first_lead = first_units[0]
    last_lead = last_units[0]

    # Same first code unit on both ends: only the second code unit varies,
    # and it does so over one contiguous range. Nothing to split.
    if first_lead == last_lead:
        return [Interval(Begin, End)]

    # Largest value the second code unit can take.
    # NOTE(review): presumably 0xDFFF, i.e. ForbiddenRange.end == 0xE000
    #               -- confirm against the definition of ForbiddenRange.
    top_trail = ForbiddenRange.end - 1

    # (1) From 'Begin' to where the second code unit hits its upper border.
    border = utf16_to_unicode([first_lead, top_trail]) + 1
    # Must hold, since 'End - 1' carries a different first code unit.
    assert End > border
    pieces = [Interval(Begin, border)]

    # (2) First code units strictly in between; each of them iterates over
    #     the complete range of second code units.
    if first_lead + 1 != last_lead:
        middle_border = utf16_to_unicode([last_lead - 1, top_trail]) + 1
        # Must hold, since at least one first code unit lies in between.
        assert middle_border > border
        pieces.append(Interval(border, middle_border))
        border = middle_border

    # (3) Remainder inside the domain of the last first code unit.
    if End > border:
        pieces.append(Interval(border, End))

    return pieces
def _split_contigous_intervals_for_surrogates(Begin, End):
    """Split [Begin, End) so that no part runs over a border between two
    different first code units of the UTF-16 sequence.

    The code points 'Begin' and 'End - 1' are converted by
    'unicode_to_utf16'. If both have the same first code unit, the range is
    returned as a single interval. Otherwise it is separated into

      (1) 'Begin' until the second code unit reaches its upper border,
      (2) the part where the first code unit increases while the second
          code unit runs over its whole range (omitted if the two first
          code units are direct neighbours),
      (3) the begin of the last first code unit's domain until 'End'.

    Asserted preconditions: 0x10000 <= Begin < End <= 0x110000.

    RETURNS: ascending list of 'Interval' objects covering [Begin, End).
    """
    assert Begin >= 0x10000
    assert End <= 0x110000
    assert End > Begin

    seq_lo = unicode_to_utf16(Begin)
    seq_hi = unicode_to_utf16(End - 1)
    lead_lo, lead_hi = seq_lo[0], seq_hi[0]

    if lead_lo == lead_hi:
        # One first code unit only => the interval is already contiguous.
        return [Interval(Begin, End)]

    # Upper border (inclusive) of the second code unit.
    # NOTE(review): assumes ForbiddenRange.end is the first value above the
    #               second code unit's range (0xE000) -- confirm.
    trail_max = ForbiddenRange.end - 1

    # (1) Head: up to the last code point having 'lead_lo' as first unit.
    cut = utf16_to_unicode([lead_lo, trail_max]) + 1
    assert End > cut  # guaranteed by lead_lo != lead_hi
    chunks = []
    chunks.append(Interval(Begin, cut))

    # (2) Body: complete domains of the first code units in between.
    neighbours_f = (lead_lo + 1 == lead_hi)
    if not neighbours_f:
        body_cut = utf16_to_unicode([lead_hi - 1, trail_max]) + 1
        assert body_cut > cut  # guaranteed by the gap between the leads
        chunks.append(Interval(cut, body_cut))
        cut = body_cut

    # (3) Tail: what is left inside the domain of 'lead_hi'.
    if End > cut:
        chunks.append(Interval(cut, End))

    return chunks
def _get_trigger_sequence_for_interval(X):
    """Translate a code point interval into a sequence of code unit
    intervals.

    X -- half-open interval [X.begin, X.end) of code points. It must lie
         either entirely below 0x10000 or entirely at/above 0x10000.

    RETURNS: [X]        if the interval lies below 0x10000; the interval is
                        passed through unchanged.
             [I0, I1]   otherwise, where I0 spans the first code units and
                        I1 the second code units of 'X.begin' and
                        'X.end - 1' as delivered by 'unicode_to_utf16'.
                        NOTE(review): this pair only describes X exactly if
                        the caller has split X at first-code-unit borders
                        beforehand -- confirm at the call site.
    """
    # 'X.end' is exclusive (the last member is 'X.end - 1', see below).
    # Hence an interval ending exactly at 0x10000 still lies entirely below
    # 0x10000 and must be accepted; a strict '<' would reject [.., 0x10000).
    assert X.begin >= 0x10000 or X.end <= 0x10000

    # An interval below 0x10000 remains the same.
    if X.end <= 0x10000:
        return [X]

    # An interval at/above 0x10000 is split up into two code unit
    # intervals, one per position in the sequence.
    front_seq = unicode_to_utf16(X.begin)
    back_seq = unicode_to_utf16(X.end - 1)

    return [
        Interval(front_seq[0], back_seq[0] + 1),
        Interval(front_seq[1], back_seq[1] + 1),
    ]
def lexatom_n_per_character(self, CharacterSet):
    """Tell whether all characters of 'CharacterSet' are encoded with the
    same number of code units.

    Only the smallest and the largest character of the set are converted
    with 'unicode_to_utf16'. The sequence length grows monotonously with
    the code point, so the two borders are sufficient.

    CharacterSet -- a 'NumberSet' (asserted).
                    NOTE(review): an empty set would fail on the index
                    access below -- presumably callers never pass one.

    RETURNS: N     length of the sequence returned by 'unicode_to_utf16',
                   if it is the same for both borders.
             None  if the two borders require sequences of different
                   length.
    """
    assert isinstance(CharacterSet, NumberSet)

    intervals = CharacterSet.get_intervals(PromiseToTreatWellF=True)
    smallest = intervals[0].begin      # first element of the number set
    largest = intervals[-1].end - 1    # last element ('end' is exclusive)

    n_smallest = len(unicode_to_utf16(smallest))
    n_largest = len(unicode_to_utf16(largest))

    return n_smallest if n_smallest == n_largest else None
def test(UC):
    """Check both conversion directions for the code point 'UC'.

    Forward:  'unicode_to_utf16(UC)' is compared with the result of
              'reference_utf16_encoder(UC)'. On mismatch both sequences and
              the difference of their first elements are printed.
    Backward: 'utf16_to_unicode' applied to the reference sequence must
              deliver 'UC' again.

    Each failed check increments the module level counter 'error_n'.
    Nothing is returned.
    """
    global error_n

    expected = reference_utf16_encoder(UC)
    actual = unicode_to_utf16(UC)

    if expected != actual:
        # Single-argument 'print(...)' behaves the same as the print
        # statement under Python 2.
        print("ERROR: unicode_to_utf16 with %06X" % UC)
        print(expected)
        print(actual)
        print(expected[0] - actual[0])
        error_n += 1

    # The round trip starts from the reference sequence, so it is
    # independent of the outcome of the forward check.
    decoded = utf16_to_unicode(expected)
    if decoded != UC:
        print("ERROR: utf16_to_unicode with %06X" % UC)
        error_n += 1
print "CHOICES: error-detect, plain;" sys.exit() if "error-detect" in sys.argv: Setup.bad_lexatom_detection_f = True else: Setup.bad_lexatom_detection_f = False # 0x000000 - 0x00D7FF: 1 code unit = 2 byte = original UCS code # 0x00E000 - 0x00FFFF: (same) # 0x010000 - 0x110000: 2 code units = 4 byte = constructed from UCS code # Range of 1st code unit: 0xD800..0xDBFF # 2nd code unit: 0xDC00..0xDFFF boarders = [0x000080, 0x00D7FF, 0x00E000, 0x00FFFF, 0x010000, 0x10FFFF] good_sequences = [unicode_to_utf16(x) for x in boarders] # Boarders of code unit ragnes which are encoding errors: bad_1st_s = [0xDC00, 0xDFFF] # boarders of disallowed CodeUnit[0] bad_2nd_s = [0x0000, 0xDBFF, 0xE000, 0x110000] # boarders of disallowed CodeUnit[1] good_sequences = [unicode_to_utf16(x) for x in boarders] trafo = EncodingTrafoUTF16() trafo.adapt_source_and_drain_range(LexatomByteN=4) sm = helper.generate_sm_for_boarders(boarders, EncodingTrafoUTF16()) bad_sequence_list = helper.get_bad_sequences(good_sequences, bad_1st_s, bad_2nd_s)
byte1 = value >> 8 assert value & 0xFFFF0000 == 0, "Value = %08X" % value result += chr(byte0) + chr(byte1) return result # (1) Critical Borders for UTF8: # 0x80, 0x800, 0x10000, 0x10ffff (last code element) # (2) Critical Borders for UTF16: # 0xD800, 0xE000 unicode_character_list = range(0x01, 0x7F) \ + range(0x800, 0x800 +10) \ + range(0x10000, 0x10000 + 10) utf8_list = "" utf16_list = "" unicode_list = "" for char in unicode_character_list: utf8_list += utf8.map_unicode_to_utf8(char) utf16_list += word_4_split(utf16.unicode_to_utf16(char)) unicode_list += byte_4_split(char) fh = open("example/utf8.txt", "wb") fh.write(utf8_list) fh.close() fh = open("example/utf16le.txt", "wb") fh.write("\xff\xfe" + utf16_list) # BOM: Little Endian fh.close() fh = open("example/ucs4le.txt", "wb") fh.write(unicode_list) fh.close()