def transform_forward(X):
    """Map the single code point X through the global 'trafo' table.

    RETURNS: the transformed code point, or None if the transformation
             did not succeed.
    """
    global trafo
    single = Interval(X, X + 1)
    ok_f, interval_list = single.transform_by_table(trafo)
    if not ok_f:
        return None
    # A one-element input interval must map to exactly one interval.
    assert len(interval_list) == 1
    return interval_list[0].begin
def add_transition(self, Trigger, TargetStateIdx):
    """Adds a transition according to trigger and target index.

       Trigger may be: None (shorthand for "all remaining triggers"),
       an int/long (a single code point), a list (raw interval data for
       a NumberSet), an Interval, or a NumberSet.

       RETURNS: The target state index (may be created newly).
    """
    # Target must be a plain (long) state index, a special E_StateIndices
    # value, or None.
    assert type(TargetStateIdx) == long \
           or TargetStateIdx is None \
           or TargetStateIdx in E_StateIndices, "%s" % TargetStateIdx.__class__.__name__
    assert Trigger.__class__ in (int, long, list, Interval, NumberSet) or Trigger is None

    if Trigger is None: # This is a shorthand to trigger via the remaining triggers
        Trigger = self.get_trigger_set_union().get_complement(Setup.buffer_encoding.source_set)
    # Normalize scalar/list triggers into Interval/NumberSet objects.
    elif type(Trigger) == long: Trigger = Interval(int(Trigger), int(Trigger+1))
    elif type(Trigger) == int:  Trigger = Interval(Trigger, Trigger+1)
    elif type(Trigger) == list: Trigger = NumberSet(Trigger, ArgumentIsYoursF=True)

    if Trigger.__class__ == Interval:
        # Extend an existing trigger set for the target, or start a new one.
        if self.__db.has_key(TargetStateIdx):
            self.__db[TargetStateIdx].add_interval(Trigger)
        else:
            self.__db[TargetStateIdx] = NumberSet(Trigger, ArgumentIsYoursF=True)
    else:
        # 'Trigger' is a NumberSet: unite with any existing trigger set.
        if self.__db.has_key(TargetStateIdx):
            self.__db[TargetStateIdx].unite_with(Trigger)
        else:
            self.__db[TargetStateIdx] = Trigger

    return TargetStateIdx
def test(Begin, End):
    """Print and verify the utf8 split of interval [Begin, End).

    The split must yield sub-intervals that are pairwise adjacent and
    that exactly cover the input interval.
    """
    X = Interval(Begin, End)
    print "-------------------------"
    print "Interval: " + X.get_string(Option="hex")
    print " .front --> " + pretty_sequence(X.begin)
    print " .back --> " + pretty_sequence(X.end - 1)
    print
    # Both interval borders must encode to utf8 sequences of equal length.
    L = len(trafo.unicode_to_utf8(X.begin))
    assert L == len(trafo.unicode_to_utf8(X.end - 1))
    result, p = trafo.split_interval_into_contigous_byte_sequence_range(X, L)
    print "Result:"
    previous_end = X.begin
    for interval in result:
        print " %s " % interval.get_string(Option="hex")
        # All sub intervals must be adjacent
        assert interval.begin == previous_end
        print " .front --> " + pretty_sequence(interval.begin)
        print " .back --> " + pretty_sequence(interval.end - 1)
        previous_end = interval.end
    # The whole interval has been spanned
    assert result[0].begin == X.begin
    assert result[-1].end == X.end
def do(BufferCodecName, BufferCodecFileName=""):
    """Create the encoding transformation for 'BufferCodecName'.

    BufferCodecName     -- "utf8", "utf16", "unicode", "unit-test", or the
                           name of a table-described codec.
    BufferCodecFileName -- if non-empty, a codec table file to load.

    RETURNS: an EncodingTrafo* object for the requested encoding.
    """
    from quex.engine.state_machine.transformation.base import EncodingTrafoUnicode
    from quex.engine.state_machine.transformation.table import EncodingTrafoByTable
    from quex.engine.state_machine.transformation.utf8_state_split import EncodingTrafoUTF8
    from quex.engine.state_machine.transformation.utf16_state_split import EncodingTrafoUTF16

    if BufferCodecName == "utf8":
        return EncodingTrafoUTF8()
    elif BufferCodecName == "utf16":
        return EncodingTrafoUTF16()
    elif BufferCodecFileName:
        # FIX: removed a dead, unguarded duplicate of the call below whose
        # result was discarded; only the guarded sanity check remains.
        try:
            os.path.splitext(os.path.basename(BufferCodecFileName))
        except:
            error.log("cannot interpret string following '--codec-file'")
        return EncodingTrafoByTable(FileName=BufferCodecFileName)
    elif BufferCodecName == "unicode":
        # (Still, 'icu' or 'iconv' may provide converted content, but ...)
        # If the internal buffer is 'unicode', then the pattern's state
        # machines are not converted. The requirement for the pattern's
        # range is the same as for the 'buffer element chunks'.
        return EncodingTrafoUnicode(NumberSet(Interval(0, 0x110000)),
                                    NumberSet(Interval(0, 0x110000)))
    elif BufferCodecName == "unit-test":
        return EncodingTrafoUnicode(NumberSet_All(), NumberSet_All())
    else:
        return EncodingTrafoByTable(BufferCodecName)
def _split_by_transformed_sequence_length(X):
    """Split Unicode interval into intervals where all values have the
       same utf8-byte sequence length.

       RETURNS: map: sequence length --> Unicode Sub-Interval of X.
                None, if the clipped interval is empty.
    """
    # Clip the interval to the range representable in utf8.
    if X.begin < 0:      X.begin = 0
    if X.end > UTF8_MAX: X.end = UTF8_MAX + 1
    if X.size() == 0:    return None

    result = {}
    front = X.begin
    # utf8 sequence length of the very last code point in the interval.
    final_length = len(unicode_to_utf8(X.end - 1))
    while True:
        # utf8 sequence length of the current chunk's first code point.
        length = len(unicode_to_utf8(front))
        if length == final_length:
            # Last chunk: extends until the end of the interval.
            result[length] = Interval(front, X.end)
            break
        # Chunk ends where sequences become one byte longer.
        border = UTF8_BORDERS[length - 1]
        result[length] = Interval(front, border)
        front = border
    return result
def __init__(self):
    # UTF16 code units occupy the value range [0, 0x10000).
    EncodingTrafoBySplit.__init__(self, "utf16", CodeUnitRange=NumberSet.from_range(0, 0x10000))
    # Code unit 0: error = complement of [0, 0xDC00) u [0xE000, 0x10000),
    # i.e. the low-surrogate range and anything >= 0x10000.
    self.error_range_code_unit0 = NumberSet([
        Interval(0x0000, 0xDC00), Interval(0xE000, 0x10000)
    ]).get_complement(NumberSet_All())
    # Code unit 1: error = everything outside the low-surrogate
    # range [0xDC00, 0xE000).
    self.error_range_code_unit1 = NumberSet([
        Interval(0xDC00, 0xE000)
    ]).get_complement(NumberSet_All())
def test_inverse(): print "INVERSE" print "--------------------------------------------------------------------------------" the_inverse("(a) normal", Interval(5000, 6000)) the_inverse("(b) lower border = - maxint", Interval(-sys.maxint, 6000)) the_inverse("(c) upper border = maxint", Interval(5000, sys.maxint)) the_inverse("(c) upper/lower border = +/- maxint", Interval(-sys.maxint, sys.maxint))
def test_inverse(): print "INVERSE" print "--------------------------------------------------------------------------------" the_inverse("(a) normal", Interval(5000, 6000)) the_inverse("(b) lower border = - maxint", Interval(-INTEGER_MAX, 6000)) the_inverse("(c) upper border = maxint", Interval(5000, INTEGER_MAX)) the_inverse("(c) upper/lower border = +/- maxint", Interval(-INTEGER_MAX, INTEGER_MAX))
def test(A0, A1, B0, B1): print "------------------------------------------------" A = Interval(A0, A1) B = Interval(B0, B1) print "%s < %s : %s" % (repr(A), repr(B), A < B) print "%s <= %s : %s" % (repr(A), repr(B), A <= B) print "%s == %s : %s" % (repr(A), repr(B), A == B) print "%s != %s : %s" % (repr(A), repr(B), A != B) print "%s >= %s : %s" % (repr(A), repr(B), A >= B) print "%s > %s : %s" % (repr(A), repr(B), A > B)
def _split_contigous_intervals_for_surrogates(Begin, End):
    """Splits the interval X into sub interval so that no interval runs over a
    'surrogate' border of the last word. For that, it is simply checked if the
    End falls into the same 'surrogate' domain of 'front' (start value of
    front = Begin). If it does not an interval [front, end_of_domain) is split
    up and front is set to end of domain. This procedure repeats until front
    and End lie in the same domain.

    Begin, End -- code point borders, must lie in [0x10000, 0x110000].

    RETURNS: list of Interval objects covering [Begin, End) contiguously.
    """
    global ForbiddenRange
    assert Begin >= 0x10000
    assert End <= 0x110000
    assert End > Begin
    front_seq = unicode_to_utf16(Begin)
    back_seq = unicode_to_utf16(End - 1)
    # (*) First word is the same.
    #     Then,
    #     -- it is either a one word character.
    #     -- it is a range of two word characters, but the range
    #        extends in one contigous range in the second surrogate.
    #     In both cases, the interval is contigous.
    if front_seq[0] == back_seq[0]:
        return [Interval(Begin, End)]
    # (*) First word is NOT the same
    #     Separate into three domains:
    #
    #     (1) Interval from Begin until second surrogate hits border 0xE000
    #     (2) Interval where the first surrogate inreases while second
    #         surrogate iterates over [0xDC00, 0xDFFF]
    #     (3) Interval from begin of last surrogate border to End
    result = []
    # 'end' = first code point after the first surrogate's domain.
    end = utf16_to_unicode([front_seq[0], ForbiddenRange.end - 1]) + 1
    # (1) 'Begin' until second surrogate hits border 0xE000
    #     (The following **must** hold according to entry condition about
    #      front and back sequence.)
    assert End > end
    result.append(Interval(Begin, end))
    if front_seq[0] + 1 != back_seq[0]:
        # (2) Second surrogate iterates over [0xDC00, 0xDFFF]
        mid_end = utf16_to_unicode([back_seq[0] - 1, ForbiddenRange.end - 1 ]) + 1
        #     (The following **must** hold according to entry condition about
        #      front and back sequence.)
        assert mid_end > end
        result.append(Interval(end, mid_end))
        end = mid_end
    # (3) Last surrogate border to End
    if End > end:
        result.append(Interval(end, End))
    return result
def prepare(A_list, B_list):
    """Build two NumberSet objects from lists of (begin, end) pairs and
       check their internal consistency.

       RETURNS: (NumberSet for A_list, NumberSet for B_list)
    """
    def build(pair_list):
        # One interval per (begin, end) pair.
        number_set = NumberSet()
        for begin, end in pair_list:
            number_set.add_interval(Interval(begin, end))
        number_set.assert_consistency()
        return number_set

    return build(A_list), build(B_list)
def __do_this(A, B): interval_list = B.get_intervals() for interval in interval_list: print "#" print "# A = " + repr(A) print "# B = " + repr(interval) X = deepcopy(A) safe = Interval(interval.begin, interval.end) X.cut_interval(safe) X.assert_consistency() safe.begin = 7777 safe.end = 0000 print "# A.cut_interval(B) = " + repr(X)
def adapt_ranges_to_lexatom_type_range(self, LexatomTypeRange): self._adapt_error_ranges_to_lexatom_type_range(LexatomTypeRange) # UTF16 requires at least 2 byte for a 'normal code unit'. Anything else # requires to cut on the addmissible set of code points. if LexatomTypeRange.end < 0x10000: self.source_set.mask(0, LexatomTypeRange.end) else: self.source_set.mask(0, 0x110000) if LexatomTypeRange.end > 0x10000: self._error_range_by_code_unit_db[0].unite_with( Interval(0x10000, LexatomTypeRange.end)) self._error_range_by_code_unit_db[1].unite_with( Interval(0x10000, LexatomTypeRange.end))
def _get_trigger_sequence_for_interval(X):
    """Translate interval X into the sequence of UTF16 code unit
       intervals that trigger it.

       RETURNS: list of one Interval (BMP character) or two Intervals
                (surrogate pair positions).
    """
    # The interval either lies entirely >= 0x10000 or entirely < 0x10000
    assert X.begin >= 0x10000 or X.end < 0x10000

    if X.end < 0x10000:
        # An interval below < 0x10000 remains the same
        return [ X ]

    # Interval >= 0x10000: each code point becomes two code units.
    first = unicode_to_utf16(X.begin)
    last = unicode_to_utf16(X.end - 1)
    return [ Interval(first[0], last[0] + 1),
             Interval(first[1], last[1] + 1) ]
def __init__(self):
    # A character in UTF16 is at maximum represented by two code units.
    # => Two error ranges.
    # Code unit 0: error = complement of [0, 0xDC00) u [0xE000, 0x10000).
    error_range_0 = NumberSet([
        Interval(0x0000, 0xDC00), Interval(0xE000, 0x10000)
    ]).get_complement(NumberSet_All()) # Adapted later
    # Code unit 1: error = complement of the low-surrogate range.
    error_range_1 = NumberSet([Interval(0xDC00, 0xE000)]).get_complement(
        NumberSet_All()) # Adapted later
    error_range_by_code_unit_db = {0: error_range_0, 1: error_range_1}
    EncodingTrafoBySplit.__init__(self, "utf16", error_range_by_code_unit_db)
def __create_database_file(TargetEncoding, TargetEncodingName):
    """Writes a database file for a given TargetEncodingName. The
       TargetEncodingName is required to name the file where the data is to
       be stored.

       RETURNS: True  -- file written.
                False -- encoding is not of constant byte size; no file.
    """
    encoder = codecs.getencoder(TargetEncoding)
    prev_output = -1
    db = []
    bytes_per_char = -1
    for input in range(0x110000):
        output, n = __get_transformation(encoder, input)

        # Only constant-size byte encodings can be described by this table.
        if bytes_per_char == -1:
            bytes_per_char = n
        elif n != -1 and bytes_per_char != n:
            print "# not a constant size byte format."
            return False

        # Detect discontinuity in the mapping
        if prev_output == -1:
            if output != -1:
                # Start of a new contiguous source interval.
                input_interval = Interval(input)
                target_interval_begin = output
        elif output != prev_output + 1:
            # If interval was valid, append it to the database
            input_interval.end = input
            db.append((input_interval, target_interval_begin))
            # If interval ahead is valid, prepare an object for it
            if output != -1:
                input_interval = Interval(input)
                target_interval_begin = output
        prev_output = output

    if prev_output != -1:
        # Close and record the final open interval.
        input_interval.end = input
        db.append((input_interval, target_interval_begin))

    fh = open_file_or_die(QUEX_CODEC_DB_PATH + "/%s.dat" % TargetEncoding, "wb")
    fh.write("// Describes mapping from Unicode Code pointer to Character code in %s (%s)\n" \
             % (TargetEncoding, TargetEncodingName))
    fh.write("// [SourceInterval.begin] [SourceInterval.Size] [TargetInterval.begin] (all in hexidecimal)\n")
    for i, t in db:
        fh.write("0x%X %i 0x%X\n" % (i.begin, i.end - i.begin, t))
    fh.close()
    return True
def adapt_source_and_drain_range(self, LexatomByteN):
    """The drain range may be restricted due to the number of bytes given
       per lexatom. If the 'LexatomByteN' is '-1' it is unrestricted which
       may be useful for unit tests and theoretical investigations.

       DERIVED CLASS MAY HAVE TO WRITE A DEDICATED VERSION OF THIS FUNCTION
       TO MODIFY THE SOURCE RANGE '.source_set'.
    """
    if LexatomByteN == -1:
        # Unrestricted lexatom size: any value is admissible.
        self.lexatom_range = Interval_All()
        return
    assert LexatomByteN >= 1

    lexatom_min_value = self.drain_set.minimum()
    lexatom_max_value = self.drain_set.supremum() - 1
    # FIX: removed a redundant 'if LexatomByteN != -1:' guard -- it was
    # always true here because of the early return above.
    try:
        value_n = 256 ** LexatomByteN
    except:
        error.log("Error while trying to compute 256 power the 'lexatom-size' (%i bytes)\n" \
                  % LexatomByteN + \
                  "Adapt \"--buffer-element-size\" or \"--buffer-element-type\",\n" + \
                  "or specify '--buffer-element-size-irrelevant' to ignore the issue.")
    # Clip the admissible value range to what 'LexatomByteN' bytes can hold.
    lexatom_min_value = 0
    lexatom_max_value = min(lexatom_max_value, value_n - 1)
    lexatom_max_value = min(lexatom_max_value, sys.maxint)
    assert lexatom_max_value > lexatom_min_value

    self.lexatom_range = Interval(lexatom_min_value, lexatom_max_value + 1)
    self.drain_set.mask_interval(self.lexatom_range)
def do(self, UnicodeTrafoInfo, ProvidedConversionInfoF=False): """Creates code for a conversion to target encoding according to the conversion_table. """ # 'ProvidedConversionTableF' is only to be used for Unit Tests if ProvidedConversionInfoF: conversion_table = UnicodeTrafoInfo else: conversion_table = self.get_conversion_table(UnicodeTrafoInfo) assert all( isinstance(entry, ConversionInfo) for entry in conversion_table) # Make sure that the conversion table is sorted conversion_table.sort(key=attrgetter("codec_interval_begin")) def action(ci): return "{ %s %s }" % \ (self.get_offset_code(ci), self.jump_to_output_formatter(ci.code_unit_n)) if len(conversion_table) == 1: ci = conversion_table[0] txt = [" %s" % self.get_offset_code(ci)] txt.extend(self.unicode_to_output(ci.code_unit_n)) else: tm = [(Interval(ci.codec_interval_begin, ci.codec_interval_begin + ci.codec_interval_size), action(ci)) for ci in conversion_table] txt = [] transition_map.do(txt, tm, AssertBorderF=False) txt.append(self.unicode_to_output_all_ranges()) return "\n".join(txt)
def consider_interval(self, Begin, End):
    """Add the character range [Begin, End] to the match set.

    RAISES: RegularExpressionException if Begin > End, i.e. the range
            was given with its borders reversed.
    """
    if Begin > End:
        message = "Character range: '-' requires character with 'lower code' to preceed\n" + \
                  "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                  (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(End), Begin, End)
        raise RegularExpressionException(message)

    self.match_set.add_interval(Interval(Begin, End))
def get(self):
    """Build a NumberSet from the current cursor state, then advance the
    cursor to the next combination.

    RETURNS: NumberSet, or None when the cursor is empty.
    """
    # Transform 'cursor' into a number set
    result = NumberSet()
    K = len(self.__cursor)
    if K == 0:
        return None
    k = 0
    end = 0
    # Cursor values are read pairwise: (gap to next begin, interval size).
    while k < K - 1:
        begin = end + self.__cursor[k]
        end = begin + self.__cursor[k + 1]
        if end > self.N:
            # Beyond the limit 'N': shrink the cursor and stop collecting.
            self.__cursor.pop()
            K -= 1
            break
        if begin != end:
            result.quick_append_interval(Interval(begin, end))
        k += 2
    # Increment cursor -- odometer style: position 0 steps by 2 up to a
    # limit of 8, higher positions step by 1 up to a limit of 3; on
    # overflow reset to 1 and carry into the next position.
    k = 0
    while k < K:
        if k == 0:
            self.__cursor[k] += 2
            if self.__cursor[k] < 8:
                break
        else:
            self.__cursor[k] += 1
            if self.__cursor[k] < 3:
                break
        self.__cursor[k] = 1
        k += 1
    return result
def construct_tm(IntervalList):
    """Build a TransitionMap where each interval of 'IntervalList' maps
       to a consecutive character code starting at 'a'.
    """
    base = ord('a')
    entry_list = []
    for i, pair in enumerate(IntervalList):
        entry_list.append((Interval(pair[0], pair[1]), base + i))
    return TransitionMap.from_iterable(entry_list)
def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence):
    """Transform the union of many Unicode script character sets and verify:
    (positive) each script's characters are matched by the transformed state
    machine; (negative) no character outside the union is matched.
    """
    script_list = [
        "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille",
        "Buginese", "Buhid", "Canadian_Aboriginal", "Cherokee", "Common",
        "Cuneiform", "Cypriot", "Deseret", "Gothic", "Greek", "Hanunoo",
        "Hebrew", "Hiragana", "Inherited", "Kannada", "Han", "Katakana",
        "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B",
        "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya",
        "Ogham", "Old_Italic", "Old_Persian", "Phoenician", "Shavian",
        "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil",
        "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi"
    ]
    sets = [X(name) for name in script_list]
    # Combine all script state machines into one and transform it.
    orig = combination.do(map(lambda x: x.sm, sets))
    state_n_before, result = transform(Trafo, orig)
    # print result.get_graphviz_string(Option="hex")
    # Positive check: every script set must still be recognized.
    for set in sets:
        set.check(result, unicode_to_transformed_sequence)
    print "Translated %i groups without abortion on error (OK)" % len(sets)

    union = NumberSet()
    for nset in map(lambda set: set.charset, sets):
        union.unite_with(nset)
    # Negative check: characters outside the union must not be matched.
    inverse_union = NumberSet(Interval(0, 0x110000))
    inverse_union.subtract(union)
    # print inverse_union.get_string(Option="hex")
    check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True),
                   unicode_to_transformed_sequence)
def _get_contigous_intervals(X):
    """Split Unicode interval into intervals where all values have the same
    utf16-byte sequence length. This is fairly simple in comparison with
    utf8-byte sequence length: There are only two lengths: 2 bytes and 2 x 2
    bytes.

    RETURNS: [X0, List1]

             X0 = the sub-interval where all values are 1 word (2 byte)
                  utf16 encoded.

                  None => No such interval

             List1 = list of contigous sub-intervals where coded as 2 words.

                  None => No such intervals
    """
    global ForbiddenRange
    # Clip 'open' borders to the unicode range.
    if X.begin == -sys.maxint: X.begin = 0
    if X.end == sys.maxint:    X.end = 0x110000
    assert X.end != X.begin                     # Empty intervals are nonsensical
    assert X.end <= 0x110000                    # Interval must lie in unicode range
    assert not X.check_overlap(ForbiddenRange)  # The 'forbidden range' is not to be covered.

    if X.end <= 0x10000:
        # Entirely below 0x10000: one-word encoding only.
        return [X, None]
    elif X.begin >= 0x10000:
        # Entirely at/above 0x10000: two-word encoding only.
        return [None, _split_contigous_intervals_for_surrogates(X.begin, X.end)]
    else:
        # Straddles the border: split at 0x10000.
        return [Interval(X.begin, 0x10000),
                _split_contigous_intervals_for_surrogates(0x10000, X.end)]
def test(Begin, End): X = Interval(Begin, End) print "-------------------------" print "Interval: " + X.get_string(Option="hex") print " .front --> " + pretty_sequence(X.begin) print " .back --> " + pretty_sequence(X.end - 1) print x0, list1 = trafo.get_contigous_intervals(X) print "Result:" print " Interval < 0x10000: ", if x0 is not None: print "%s" % x0.get_string(Option="hex") else: print "None" print " Intervals >= 0x10000: ", if list1 is None: print "None" else:
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """For a contigous interval X whose code points all encode to utf8
       sequences of length L, return one Interval per byte position.
    """
    first = unicode_to_utf8(X.begin)
    last = unicode_to_utf8(X.end - 1)
    # If the interval is contigous it must produce equal length utf8
    # sequences; byte position i then ranges over [first[i], last[i]].
    result = []
    for i in range(L):
        result.append(Interval(first[i], last[i] + 1))
    return result
def test(Begin, End): X = Interval(Begin, End) print "-------------------------" print "Interval: " + X.get_string(Option="hex") print " .front --> " + pretty_sequence(X.begin) print " .back --> " + pretty_sequence(X.end - 1) print x0, list1 = trafo._get_contigous_intervals(X) print "Result:" print " Interval < 0x10000: ", if x0 is not None: print "%s" % x0.get_string(Option="hex") else: print "None" print " Intervals >= 0x10000: ", if list1 is None: print "None" else:
def __init__(self): error_range_0 = NumberSet([ Interval(0b00000000, 0b01111111 + 1), Interval(0b11000000, 0b11011111 + 1), Interval(0b11100000, 0b11101111 + 1), Interval(0b11110000, 0b11110111 + 1), Interval(0b11111000, 0b11111011 + 1), Interval(0b11111100, 0b11111101 + 1), ]).get_complement(NumberSet_All()) # Adapted later error_range_N = NumberSet(Interval(0b10000000, 0b10111111+1)) \ .get_complement(NumberSet_All()) # Adapted later error_range_by_code_unit_db = { 0: error_range_0, 1: error_range_N, 2: error_range_N, 3: error_range_N, 4: error_range_N, 5: error_range_N, 6: error_range_N, 7: error_range_N, 8: error_range_N } EncodingTrafoBySplit.__init__(self, "utf8", error_range_by_code_unit_db) self.UnchangedRange = 0x7F
def do(section_list, fh):
    """Parses a codec information file. The described codec can only be a
    'static character length' encoding. That is every character in the code
    occupies the same number of bytes.

    section_list -- (in/out) receives one [source_begin, source_end,
                    target_begin] record per parsed section.
    fh           -- file handle to read from.

    RETURNS: [0] Set of characters in unicode which are covered by the
                 described codec.
             [1] Range of values in the codec elements.
             [2] Error string, or None on success.
    """
    source_set = NumberSet()
    drain_set = NumberSet()
    error_str = None
    try:
        while error_str is None:
            # Each section is three integers: begin, size, target begin.
            skip_whitespace(fh)
            source_begin = read_integer(fh)
            if source_begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                continue
            skip_whitespace(fh)
            source_size = read_integer(fh)
            if source_size is None:
                error_str = "Missing integer (source interval size) in codec file."
                continue
            skip_whitespace(fh)
            target_begin = read_integer(fh)
            if target_begin is None:
                error_str = "Missing integer (target interval begin) in codec file."
                continue
            source_end = source_begin + source_size
            # FIX: idiomatic 'section_list.append(...)' instead of the
            # unbound 'list.append(section_list, ...)' call.
            section_list.append([source_begin, source_end, target_begin])
            source_set.add_interval(Interval(source_begin, source_end))
            drain_set.add_interval(
                Interval(target_begin, target_begin + source_size))
    except EndOfStreamException:
        # End of file terminates the parse loop normally.
        pass
    return source_set, drain_set, error_str
def test(TM, Target="X"): tm = TransitionMap.from_iterable([ (Interval(x[0], x[1]), y) for x, y in TM ]) print "____________________________________________________________________" print "BEFORE:" show(tm) tm.combine_adjacents() tm.assert_continuity(StrictF=True) print "AFTER:" show(tm)
def test(TM, Target="X"): tm = TransitionMap([ (Interval(x[0], x[1]), y) for x, y in TM ]) print "____________________________________________________________________" print "BEFORE:" show(tm) tm.fill_gaps(Target, Setup.buffer_codec.source_set.minimum(), Setup.buffer_codec.source_set.supremum()) tm.assert_adjacency(ChangeF=True) print "AFTER:" show(tm)
def set_target(self, Character, NewTarget): """Set the target in the transition map for a given 'Character'. """ # Find the index of the interval which contains 'Character' i = TransitionMap.bisect(self, Character) if i is None: self.insert(0, (Interval(Character), NewTarget)) self.sort() return # Split the found interval, if necessary, so that the map # contains 'Character' --> 'NewTarget'. interval, target = self[i] assert interval.size() > 0 new_i = None if target == NewTarget: return # Nothing to be done elif interval.size() == 1: self[i] = (interval, NewTarget) new_i = i elif Character == interval.end - 1: self.insert(i + 1, (Interval(Character), NewTarget)) interval.end -= 1 new_i = i + 1 elif Character == interval.begin: self.insert(i, (Interval(Character), NewTarget)) interval.begin += 1 new_i = i else: self.insert(i + 1, (Interval(Character), NewTarget)) self.insert(i + 2, (Interval(Character + 1, interval.end), target)) interval.end = Character new_i = i + 1 # Combine adjacent intervals which trigger to the same target. self.combine_adjacents(new_i) self.assert_continuity() return
def parse_table(Filename, IntervalColumnList=(), NumberColumnList=(), NumberListColumnList=(), CommentF=False):
    """Parse a ';'-separated unicode database table file.

       Columns in IntervalColumnList   --> converted to Interval() objects
                  NumberColumnList     --> converted to integers (hex numbers)
                  NumberListColumnList --> converted to integer list (hex numbers)

       FIX: default arguments are now immutable tuples instead of shared
       mutable lists; the single-value interval case no longer parses the
       same hex field twice; the file handle is closed after reading.

       RETURNS: list of records, one per non-empty, non-comment line.
    """
    fh = open_data_base_file(Filename)
    record_set = []
    for line in fh.readlines():
        # Strip away trailing comments ('#' until end of line).
        comment_idx = line.find("#")
        comment = None
        if comment_idx != -1:
            comment = line[comment_idx + 1:]
            line = line[:comment_idx]
        if line == "" or line.isspace():
            continue
        # append content to record set
        cells = map(lambda x: x.strip(), line.split(";"))
        for i in IntervalColumnList:
            fields = cells[i].split("..")  # range: value0..value1
            assert len(fields) in [1, 2]
            begin = int("0x" + fields[0], 16)
            if len(fields) == 2:
                end = int("0x" + fields[1], 16) + 1
            else:
                # Single value: interval of size one.
                end = begin + 1
            cells[i] = Interval(begin, end)
        for i in NumberColumnList:
            cells[i] = int("0x" + cells[i], 16)
        for i in NumberListColumnList:
            cells[i] = [ int("0x" + n, 16) for n in cells[i].split() ]
        # Sometimes, the comment is useful
        if CommentF:
            cells.append(comment)
        record_set.append(cells)
    fh.close()
    # There is no need to decouple here, since the record_set is created
    # each time that the function is called.
    return record_set
def test(X): print "#_______________________________________________" nset = NumberSet([ Interval(x, y) for x, y in X]) clone = nset.clone() print "#NumberSet: %s" % nset result = nset.clone() result.complement(all) print "#NumberSet.inverse: %s" % result assert result.is_equal(nset.get_complement(all)) assert result.intersection(nset).is_empty() assert result.union(nset).is_all()