def general_checks(loop_map, appendix_sm_list):
    """Run sanity checks on a loop map and print the results.

    Checks performed:
      (1) the character sets of all entries are pairwise disjoint,
      (2) no incidence id appears twice among 'plain' entries or twice
          among 'couple' entries,
      (3) the appendix sm-ids referenced by the loop map are exactly the
          ids of the state machines in 'appendix_sm_list'.
    Also reports whether an 'exit' entry (count_action is None) exists.
    """
    print "#_[ Checks ]__________________________________________________"
    print
    print "character sets do not intersect",
    all_set = NumberSet()
    for lei in loop_map:
        assert lei.character_set is not None
        # Disjointness: each set must not overlap the union seen so far.
        assert not lei.character_set.has_intersection(all_set)
        all_set.unite_with(lei.character_set)
    print "[ok]"

    print "count actions do not appear more than once",
    count_action_couple_set = set()
    count_action_plain_set  = set()
    exit_exists_f           = False
    appendix_sm_id_set      = set()
    for lei in loop_map:
        if lei.count_action is None:
            # 'exit' entry: no count action and no appendix sm.
            assert lei.appendix_sm_id is None
            exit_exists_f = True
        elif lei.appendix_sm_id is None:
            # 'plain' entry: count action, no appendix sm.
            assert lei.incidence_id not in count_action_plain_set
            count_action_plain_set.add(lei.incidence_id)
        else:
            # 'couple' entry: count action plus appendix sm.
            assert lei.incidence_id not in count_action_couple_set
            count_action_couple_set.add(lei.incidence_id)
            appendix_sm_id_set.add(lei.appendix_sm_id)
    print "[ok]"

    list_id_set = set(sm.get_id() for sm in appendix_sm_list)
    assert appendix_sm_id_set == list_id_set
    print "appendix sm-ids are the same in loop map and sm list: [ok]"
    print "exit character set exits: [%s]" % exit_exists_f
    print
    def get(self):
        """Transform the internal cursor into a NumberSet and advance it.

        The cursor is a list of numbers consumed pairwise: cursor[k] is an
        offset from the previous interval's end, cursor[k+1] the interval's
        size.  Intervals exceeding self.N truncate the cursor and stop the
        build.  Afterwards the cursor is incremented like an odometer so
        that successive calls enumerate different interval layouts.

        RETURNS: NumberSet, or None once the cursor is empty.
        """
        # Transform 'cursor' into a number set
        result = NumberSet()
        K = len(self.__cursor)
        if K == 0:
            return None
        k = 0
        end = 0
        while k < K - 1:
            # begin/end are accumulated: each pair is relative to the
            # previous interval's end.
            begin = end + self.__cursor[k]
            end = begin + self.__cursor[k+1]
            if end > self.N:
                # Interval would exceed the admissible range => shrink the
                # cursor and stop building.
                self.__cursor.pop()
                K -= 1
                break
            if begin != end:
                result.quick_append_interval(Interval(begin, end))
            k += 2

        # Increment cursor: odometer-style carry into the next position
        # (position 0 counts in steps of 2 up to 8; others in steps of 1
        # up to 3; on overflow reset to 1 and carry).
        k = 0
        while k < K:
            if k == 0:
                self.__cursor[k] += 2
                if self.__cursor[k] < 8:
                    break
            else:
                self.__cursor[k] += 1
                if self.__cursor[k] < 3:
                    break
            self.__cursor[k] = 1
            k += 1
        return result
def buffer_codec_prepare(self, BufferCodecName, BufferCodecFileName=None, Module=None): """Determines: Setup.buffer_codec_name Setup.buffer_codec """ assert BufferCodecName == "unit-test" \ or self.__buffer_element_specification_done_f == True if BufferCodecName in ("utf8", "utf16"): assert Module is not None result = codec_db.CodecDynamicInfo(BufferCodecName, Module) elif BufferCodecFileName: os.path.splitext(os.path.basename(BufferCodecFileName)) try: os.path.splitext(os.path.basename(BufferCodecFileName)) except: error.log("cannot interpret string following '--codec-file'") result = codec_db.CodecTransformationInfo(FileName=BufferCodecFileName) elif BufferCodecName == "unicode": # (Still, 'icu' or 'iconv' may provide converted content, but ...) # If the internal buffer is 'unicode', then the pattern's state # machines are not converted. The requirement for the pattern's # range is the same as for the 'buffer element chunks'. result = codec_db.CodecInfo("unicode", NumberSet.from_range(0, self.get_character_value_limit()), NumberSet.from_range(0, self.get_character_value_limit())) elif BufferCodecName == "unit-test": result = codec_db.CodecInfo("unicode", NumberSet.from_range(-sys.maxint, sys.maxint), NumberSet.from_range(-sys.maxint, sys.maxint)) else: result = codec_db.CodecTransformationInfo(BufferCodecName) self.buffer_codec = result
    def __init__(self):
        # UTF16 code units are 16 bit wide => code unit range [0, 0x10000).
        EncodingTrafoBySplit.__init__(self, "utf16",
                                      CodeUnitRange=NumberSet.from_range(0, 0x10000))
        # Admissible FIRST code units: [0, 0xDC00) and [0xE000, 0x10000);
        # the error range is the complement, i.e. the low-surrogate range
        # 0xDC00..0xDFFF plus everything >= 0x10000.
        self.error_range_code_unit0 = NumberSet([
            Interval(0x0000, 0xDC00), Interval(0xE000, 0x10000)
        ]).get_complement(NumberSet_All())
        # Admissible SECOND code units: the low-surrogate range
        # [0xDC00, 0xE000); everything else is an error.
        self.error_range_code_unit1 = NumberSet([
            Interval(0xDC00, 0xE000)
        ]).get_complement(NumberSet_All())
def __get_remaining_set(self): ignored = (E_CharacterCountType.BAD, E_CharacterCountType.BEGIN_NEWLINE_SUPPRESSOR, E_CharacterCountType.BEGIN_NEWLINE, E_CharacterCountType.END_NEWLINE) result = NumberSet() for character_set, info in self.__map: if info.cc_type in ignored: continue result.unite_with(character_set) return result.get_complement(Setup.buffer_codec.source_set)
def prepare(A_list, B_list):
    """Build two NumberSet objects from lists of (begin, end) pairs and
    verify their internal consistency before handing them back."""
    def build(pair_list):
        ns = NumberSet()
        for lo, hi in pair_list:
            ns.add_interval(Interval(lo, hi))
        return ns

    A = build(A_list)
    B = build(B_list)
    A.assert_consistency()
    B.assert_consistency()
    return A, B
def load_Composition_Exclusion(self): # Column 0 contains what is interesting ... table = parse_table("CompositionExclusions.txt", NumberColumnList=[0]) number_set = NumberSet() for row in table: begin = row[0] number_set.quick_append_interval(Interval(begin, begin + 1)) number_set.clean() self.db["CE"].code_point_db = number_set
def verify(A, TrafoInfo):
    """Brute-force reference transformation of the NumberSet 'A'.

    TrafoInfo is a list of [source_begin, source_end, target_begin] rules.
    Every single code point of 'A' is mapped individually, so this serves
    as an independent cross-check for interval-based transformations.

    RETURNS: NumberSet of transformed code points.
    """
    result = NumberSet()
    for interval in A.get_intervals():
        # 'xrange' avoids materializing the point list (Python 2, consistent
        # with usage elsewhere in this file).
        for x in xrange(interval.begin, interval.end):
            for source_begin, source_end, target_begin in TrafoInfo:
                if source_begin <= x < source_end:
                    offset = x - source_begin
                    result.add_interval(Interval(target_begin + offset))
    result.assert_consistency()
    return result
def get_ending_character_set(self): """Returns the union of all characters that trigger to an acceptance state in the given state machine. This is to detect whether the newline or suppressor end with an indentation character (grid or space). """ result = NumberSet() for end_state_index in self.get_acceptance_state_index_list(): for state in self.states.itervalues(): if state.target_map.has_target(end_state_index) == False: continue result.unite_with(state.target_map.get_trigger_set_to_target(end_state_index)) return result
def test(X): print "#_______________________________________________" nset = NumberSet([ Interval(x, y) for x, y in X]) clone = nset.clone() print "#NumberSet: %s" % nset result = nset.clone() result.complement(all) print "#NumberSet.inverse: %s" % result assert result.is_equal(nset.get_complement(all)) assert result.intersection(nset).is_empty() assert result.union(nset).is_all()
def __wildcard_value_match(self, WildCardValue): result = NumberSet() value_list = self.get_wildcard_value_matches(WildCardValue) if len(value_list) == 0: return None for value in value_list: result.unite_with(NumberSet(self.code_point_db[value])) # No decoupling, since result is computed each fresh and new return result
def the_intersection(Comment, A, B): if B.__class__ == Interval: B = NumberSet(B) print "#\n#" + Comment print "# A = " + repr(A) print "# B = " + repr(B) result = A.intersection(B) result.assert_consistency() print "# intersection(A,B) = " + repr(result) result = B.intersection(A) result.assert_consistency() print "# intersection(B,A) = " + repr(result)
def _get_loop_map(TheCountMap, SmList, IidLoopExit):
    """A loop map tells about the behavior of the core loop. It tells what
    needs to happen as a consequence to an incoming character. Two options:

        -- Return to loop (normal case)
        -- Enter the tail (appendix) of a parallel state machine.

    RETURNS: [0] List of LoopMapEntry-s.
             [1] List of combined appendix state machines.

    A LoopMapEntry consists of:

       .character_set: Character set that triggers.
       .count_action:  Count action related to the character set.
                       == None, if the character set causes 'loop exit'.
       .incidence_id:  Incidence Id of terminal that is triggered by character set.
                       -- incidence id of count action terminal, or
                       -- incidence id of couple terminal.
       .appendix_sm:   Appendix state machine
                       -- combined appendix state machines, or
                       -- None, indicating that there is none.
    """
    L = TheCountMap.loop_character_set()

    # 'couple_list': Transitions to 'couple terminals'
    #                => connect to appendix state machines
    couple_list,     \
    appendix_sm_list = _get_LoopMapEntry_list_parallel_state_machines(TheCountMap,
                                                                      SmList)

    L_couple = NumberSet.from_union_of_iterable(
        lei.character_set for lei in couple_list
    )

    # 'plain_list': Transitions to 'normal terminals'
    #               => perform count action and loop.
    L_plain    = L.difference(L_couple)
    plain_list = _get_LoopMapEntry_list_plain(TheCountMap, L_plain)

    # 'L_exit': Transition to exit
    #           => remaining characters cause exit.
    L_loop = NumberSet.from_union_of_iterable(
        x.character_set for x in chain(couple_list, plain_list)
    )
    universal_set = Setup.buffer_codec.source_set
    L_exit        = L_loop.get_complement(universal_set)
    exit_list     = [ LoopMapEntry(L_exit, None, IidLoopExit, None) ]

    result = couple_list + plain_list + exit_list

    # Every entry must be complete: set, and incidence id present.
    assert not any(lei is None for lei in result)
    assert not any(lei.character_set is None for lei in result)
    assert not any(lei.incidence_id is None for lei in result)

    return result, appendix_sm_list
def create_ALL_BUT_NEWLINE_state_machine(stream):
    """State machine accepting any single admissible character except
    newline (the regex '.'); reports an error if that set is empty."""
    global Setup
    # NOTE: Buffer control characters are supposed to be filtered out by the
    #       code generator.
    newline_complement = NumberSet(Interval(ord("\n"))).get_complement(
                                                Setup.buffer_codec.source_set)
    if newline_complement.is_empty():
        error.log("The set of admissible characters contains only newline.\n"
                  "The '.' for 'all but newline' is an empty set.",
                  SourceRef.from_FileHandle(stream))

    sm = StateMachine()
    sm.add_transition(sm.init_state_index, newline_complement, AcceptanceF=True)
    return sm
    def __init__(self):
        # UTF8 code units are bytes => code unit range [0, 0x100).
        drain_set = NumberSet.from_range(0, 0x100)
        EncodingTrafoBySplit.__init__(self, "utf8", CodeUnitRange=drain_set)
        # Code points <= 0x7F are encoded as one unchanged byte.
        self.UnchangedRange = 0x7F
        # Error range for the FIRST byte: complement of all admissible UTF8
        # lead byte patterns (0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx,
        # 111110xx, 1111110x).
        self.error_range_byte0 = NumberSet([
            Interval(0b00000000, 0b01111111+1), Interval(0b11000000, 0b11011111+1),
            Interval(0b11100000, 0b11101111+1), Interval(0b11110000, 0b11110111+1),
            Interval(0b11111000, 0b11111011+1), Interval(0b11111100, 0b11111101+1),
        ]).get_complement(NumberSet_All())
        # Error range for CONTINUATION bytes: complement of 10xxxxxx.
        self.error_range_byteN = NumberSet(
            Interval(0b10000000, 0b10111111+1)
        ).get_complement(NumberSet_All())
class Tracker:
    """Accumulates the character set of a character-set expression while it
    is being parsed; 'negation_f' records whether the whole set is to be
    negated afterwards."""
    def __init__(self):
        self.match_set  = NumberSet()   # characters collected so far
        self.negation_f = False         # True => set is to be negated

    def consider_interval(self, Begin, End):
        """Add the code point interval [Begin, End) to the match set;
        raise if the range borders are reversed."""
        # NOTE(review): "preceed" is a typo in the user-facing message; left
        # byte-identical here because this update changes comments only.
        if Begin > End:
            raise RegularExpressionException("Character range: '-' requires character with 'lower code' to preceed\n" + \
                                             "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                                             (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(End), Begin, End))
        self.match_set.add_interval(Interval(Begin, End))

    def consider_letter(self, CharCode):
        """A single character is the interval [CharCode, CharCode + 1)."""
        self.consider_interval(CharCode, CharCode+1)
def get_setup(L0, L1, FSM0, FSM1, FSM2):
    """Build a count-info list plus three parallel state machines for tests.

    SPECIALITIES: -- sm0 and sm1 have an intersection between their second
                     transition.
                  -- sm1 transits further upon acceptance.
                  -- sm2 has only one transition.
    RETURNS: (ci_list, [sm0, sm1, sm2])
    """
    ci_list = [
        CountInfo(dial_db.new_incidence_id(), NumberSet.from_range(L0, L1),
                  CountAction(E_CharacterCountType.COLUMN, 0)),
    ]

    # Generate State Machine that does not have any intersection with
    # the loop transitions.
    # sm0: init --FSM0--> s --NS_A--> accept
    sm0 = StateMachine()
    si = sm0.add_transition(sm0.init_state_index, FSM0)
    si = sm0.add_transition(si, NS_A, AcceptanceF=True)
    sm0.states[si].mark_acceptance_id(dial_db.new_incidence_id())

    # sm1: init --FSM1--> s0 --NS_A--> accept --NS_B--> s0 (loops back)
    sm1 = StateMachine()
    si0 = sm1.add_transition(sm1.init_state_index, FSM1)
    si = sm1.add_transition(si0, NS_A, AcceptanceF=True)
    iid1 = dial_db.new_incidence_id()
    sm1.states[si].mark_acceptance_id(iid1)
    si = sm1.add_transition(si, NS_B, si0)
    sm1.states[si].mark_acceptance_id(iid1)

    # sm2: init --FSM2--> accept (single transition)
    sm2 = StateMachine()
    si = sm2.add_transition(sm2.init_state_index, FSM2, AcceptanceF=True)
    sm2.states[si].mark_acceptance_id(dial_db.new_incidence_id())

    return ci_list, [sm0, sm1, sm2]
def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence): script_list = [ "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille", "Buginese", "Buhid", "Canadian_Aboriginal", "Cherokee", "Common", "Cuneiform", "Cypriot", "Deseret", "Gothic", "Greek", "Hanunoo", "Hebrew", "Hiragana", "Inherited", "Kannada", "Han", "Katakana", "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B", "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya", "Ogham", "Old_Italic", "Old_Persian", "Phoenician", "Shavian", "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi" ] sets = [ X(name) for name in script_list ] orig = get_combined_state_machine(map(lambda x: x.sm, sets)) state_n_before, result = transform(Trafo, orig) # print result.get_graphviz_string(Option="hex") for set in sets: set.check(result, unicode_to_transformed_sequence) print "Translated %i groups without abortion on error (OK)" % len(sets) union = NumberSet() for nset in map(lambda set: set.charset, sets): union.unite_with(nset) inverse_union = NumberSet(Interval(0, 0x110000)) inverse_union.subtract(union) # print inverse_union.get_string(Option="hex") check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True), unicode_to_transformed_sequence)
def test(UC): global trafo_cp037 x = NumberSet(UC) y = x.clone() x.transform_by_table(trafo_cp037) x.assert_consistency() print "0x%02X --> 0x%s" % (UC, x.get_string(Option="hex"))
def is_DFA_compliant(self): """Checks if the current state transitions are DFA compliant, i.e. it investigates if trigger sets pointing to different targets intersect. RETURNS: True => OK False => Same triggers point to different target. This cannot be part of a deterministic finite automaton (DFA). """ # DFA's do not have epsilon transitions if len(self.__epsilon_target_index_list) != 0: return False # check whether trigger sets intersect all_trigger_sets = NumberSet() for trigger_set in self.__db.itervalues(): if all_trigger_sets.has_intersection(trigger_set): return False else: all_trigger_sets.unite_with(trigger_set) return True
def test(ci_list, SM_list=None):
    """Drive 'loop._get_loop_map' with the given count infos and parallel
    state machines, then print checks and the resulting loop map.

    (Fix: replaced mutable default argument 'SM_list=[]' with None.)
    """
    if SM_list is None:
        SM_list = []
    Setup.buffer_codec.source_set = NumberSet_All()
    ci_map = CountInfoMap(ci_list, NumberSet.from_range(0, 100))
    iid_loop_exit = dial_db.new_incidence_id()
    loop_map,        \
    appendix_sm_list = loop._get_loop_map(ci_map, SM_list, iid_loop_exit)
    print
    print
    print
    general_checks(loop_map, appendix_sm_list)
    print_this(loop_map, appendix_sm_list)
def get_number_set(Cursor):
    """Build a NumberSet from the border positions in 'Cursor'.

    The first and last element of 'Cursor' are helper values, not interval
    borders; the values in between are consumed pairwise as (begin, end).
    RETURNS: S_None for the minimal two-element cursor, else a NumberSet.
    """
    if len(Cursor) == 2:
        return S_None
    cursor = copy(Cursor)
    cursor.pop( 0 )  # element 0 and '-1' are just helping values, no interval borders.
    result = []
    while len(cursor) != 1:
        begin = cursor.pop(0)
        end = cursor.pop(0)
        result.append(Interval(begin, end))
    return NumberSet(result)
def buffer_codec_prepare(self, BufferCodecName, BufferCodecFileName=None, Module=None): """Determines: Setup.buffer_codec_name Setup.buffer_codec """ assert BufferCodecName == "unit-test" \ or self.__buffer_element_specification_done_f == True if BufferCodecName in ("utf8", "utf16"): assert Module is not None result = codec_db.CodecDynamicInfo(BufferCodecName, Module) elif BufferCodecFileName: os.path.splitext(os.path.basename(BufferCodecFileName)) try: os.path.splitext(os.path.basename(BufferCodecFileName)) except: error.log("cannot interpret string following '--codec-file'") result = codec_db.CodecTransformationInfo( FileName=BufferCodecFileName) elif BufferCodecName == "unicode": # (Still, 'icu' or 'iconv' may provide converted content, but ...) # If the internal buffer is 'unicode', then the pattern's state # machines are not converted. The requirement for the pattern's # range is the same as for the 'buffer element chunks'. result = codec_db.CodecInfo( "unicode", NumberSet.from_range(0, self.get_character_value_limit()), NumberSet.from_range(0, self.get_character_value_limit())) elif BufferCodecName == "unit-test": result = codec_db.CodecInfo( "unicode", NumberSet.from_range(-sys.maxint, sys.maxint), NumberSet.from_range(-sys.maxint, sys.maxint)) else: result = codec_db.CodecTransformationInfo(BufferCodecName) self.buffer_codec = result
def test(Border, List): x = NumberSet([Interval(a, b) for a, b in List]) y = deepcopy(x) z = deepcopy(x) print "Border: %s" % Border print "NumberSet: %s" % x x.cut_lesser(Border) x.assert_consistency() y.cut_greater_or_equal(Border) x.assert_consistency() print "cut_lesser --> %s" % x print "cut_greater_or_equal --> %s" % y print "______________________________________" assert x.union(y).is_equal(z)
def __display_set(CharSet, cl): if Setup.query_numeric_f: display = "hex" else: display = "utf8" CharSet.intersect_with(NumberSet(Interval(0, 0x110000))) print "Characters:\n" if Setup.query_interval_f: __print_set_in_intervals(CharSet, display, 80) elif Setup.query_unicode_names_f: __print_set_character_names(CharSet, display, 80) else: __print_set_single_characters(CharSet, display, 80) print
def do(section_list, fh):
    """Parses a codec information file. The described codec can only be a
    'static character length' encoding. That is every character in the
    code occupies the same number of bytes.

    RETURNS: [0] Set of characters in unicode which are covered by the
                 described codec.
             [1] Range of values in the codec elements.
             [2] Error string, or None on success.

    (Idiom fix: 'list.append(section_list, ...)' replaced by the normal
    method call 'section_list.append(...)'.)
    """
    source_set = NumberSet()
    drain_set = NumberSet()
    error_str = None
    try:
        while error_str is None:
            skip_whitespace(fh)
            source_begin = read_integer(fh)
            if source_begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                continue
            skip_whitespace(fh)
            source_size = read_integer(fh)
            if source_size is None:
                error_str = "Missing integer (source interval size) in codec file."
                continue
            skip_whitespace(fh)
            target_begin = read_integer(fh)
            if target_begin is None:
                error_str = "Missing integer (target interval begin) in codec file."
                continue
            source_end = source_begin + source_size
            section_list.append([source_begin, source_end, target_begin])
            source_set.add_interval(Interval(source_begin, source_end))
            drain_set.add_interval(Interval(target_begin, target_begin + source_size))
    except EndOfStreamException:
        # Normal termination: file read to its end.
        pass
    return source_set, drain_set, error_str
def get_incidence_id_map(self, BeyondIncidenceId=None): """RETURNS: A list of pairs: (character_set, incidence_id) All same counting actions are referred to by the same incidence id. If BeyondIncidenceId is given, then the remaining set of characters is associated with 'BeyondIncidenceId'. """ result = [ (x.character_set, x.incidence_id) for x in self.__map ] if BeyondIncidenceId is None: return result all_set = NumberSet.from_union_of_iterable(x.character_set for x in self.__map) beyond_set = all_set.get_complement(Setup.buffer_codec.source_set) if not beyond_set.is_empty(): result.append((beyond_set, BeyondIncidenceId)) return result
def get_incidence_id_map(self, BeyondIncidenceId=None): """RETURNS: A list of pairs: (character_set, incidence_id) All same counting actions are referred to by the same incidence id. If BeyondIncidenceId is given, then the remaining set of characters is associated with 'BeyondIncidenceId'. """ result = [(x.character_set, x.incidence_id) for x in self.__map] if BeyondIncidenceId is None: return result all_set = NumberSet.from_union_of_iterable(x.character_set for x in self.__map) beyond_set = all_set.get_complement(Setup.buffer_codec.source_set) if not beyond_set.is_empty(): result.append((beyond_set, BeyondIncidenceId)) return result
def the_difference(Comment, A, B, ViceVersaF=True):
    """Print 'A.cut_interval(interval)' for every interval in B (and, if
    ViceVersaF, the same with A and B swapped)."""
    if B.__class__ == Interval:
        B = NumberSet(B)

    def __do_this(A, B):
        interval_list = B.get_intervals()
        for interval in interval_list:
            print "#"
            print "# A = " + repr(A)
            print "# B = " + repr(interval)
            X = deepcopy(A)
            # Pass a copy of the interval; it is mutated AFTER the cut --
            # presumably to prove that 'cut_interval' keeps no reference to
            # its argument (result printed below would change otherwise).
            # TODO(review): confirm that intent.
            safe = Interval(interval.begin, interval.end)
            X.cut_interval(safe)
            X.assert_consistency()
            safe.begin = 7777
            safe.end = 0000
            print "# A.cut_interval(B) = " + repr(X)

    print "#\n# " + Comment + "_" * (80 - len(Comment))
    __do_this(A, B)
    if ViceVersaF:
        __do_this(B, A)
def convert_table_to_associative_map(table, ValueColumnIdx, ValueType, KeyColumnIdx):
    """Produces a dictionary that maps from 'keys' to NumberSets. The
    number sets represent the code points for which the key (property)
    is valid.

    ValueColumnIdx: Column that contains the character code interval or
                    string to which one wishes to map.
    KeyColumnIdx:   Column that contains the 'key' to be used for the map.
    ValueType:      "NumberSet", "number" or "string" -- selects how the
                    value column is interpreted.

    RETURNS: the associative map (dict).
    RAISES:  Exception for an unknown ValueType.
             (Fix: previously raised the overly-broad BaseException.)
    """
    db = {}
    if ValueType == "NumberSet":
        for record in table:
            key = record[KeyColumnIdx].strip()
            key = key.replace(" ", "_")
            value = record[ValueColumnIdx]
            if type(value) == int:
                # A single code point => interval of size one.
                value = Interval(value)
            db.setdefault(key, NumberSet()).quick_append_interval(value, SortF=False)
    elif ValueType == "number" or ValueType == "string":
        for record in table:
            key = record[KeyColumnIdx].strip()
            key = key.replace(" ", "_")
            value = record[ValueColumnIdx]
            db[key] = value
    else:
        raise Exception("ValueType = '%s' unknown.\n" % ValueType)

    # if the content was a number set, it might be simplified, try it.
    if ValueType == "NumberSet":
        for key, number_set in db.items():
            number_set.clean()
    return db
def __whitespace_default(self): """Try to define default whitespace ' ' or '\t' if their positions are not yet occupied in the count_command_map. """ cs0 = NumberSet(ord(" ")) cs1 = NumberSet(ord("\t")) result = NumberSet() if not self.specifier_count_op_map.find_occupier(cs0, set()): result.unite_with(cs0) if not self.specifier_count_op_map.find_occupier(cs1, set()): result.unite_with(cs1) if result.is_empty(): error.log("Trying to implement default whitespace ' ' or '\\t' failed.\n" "Characters are occupied by other elements.", self.sr) return result
def unary(TheList):
    """Exercise the set-algebra identities of NumberSet against the global
    'X' built from the (p, q) pairs in 'TheList'.  Each 'equal(a, b)' call
    evaluates both expression strings (via the test driver's helpers
    inv/uni/itsct/diff/symdiff) and counts matches in 'correct_n'."""
    global correct_n
    global X
    correct_n = 0
    X = NumberSet([Interval(p,q) for p,q in TheList])
    print "# %s ---------------------" % X
    # Complement is an involution; union/intersection with the complement.
    equal("inv(inv(X))", "X")
    equal("uni(X, inv(X))", "All")
    equal("uni(inv(X), X)", "All")
    # Identity and absorbing elements of union.
    equal("uni(X, None)", "X")
    equal("uni(None, X)", "X")
    equal("uni(X, All)", "All")
    equal("uni(All, X)", "All")
    # Intersection identities.
    equal("itsct(X, inv(X))", "None")
    equal("itsct(inv(X), X)", "None")
    equal("itsct(X, None)", "None")
    equal("itsct(None, X)", "None")
    equal("itsct(X, All)", "X")
    equal("itsct(All, X)", "X")
    # Difference identities.
    equal("diff(X, inv(X))", "X")
    equal("diff(inv(X), X)", "inv(X) ")
    equal("diff(X, None)", "X")
    equal("diff(None, X)", "None")
    equal("diff(X, All)", "None")
    equal("diff(All, X)", "inv(X) ")
    # Symmetric difference identities.
    equal("symdiff(X, inv(X))", "All")
    equal("symdiff(inv(X), X)", "All")
    equal("symdiff(X, None)", "X")
    equal("symdiff(None, X)", "X")
    equal("symdiff(X, All)", "inv(X)")
    equal("symdiff(All, X)", "inv(X) ")
    print "No abort --> %i x korrekt" % correct_n
    return
def do(BufferCodecName, BufferCodecFileName=""):
    """Factory: create the encoding transformation for 'BufferCodecName'
    ("none", "utf8", "utf16", "unicode"/"utf32", "unit-test", a codec file,
    or a table-based codec name).

    (Fix: removed the dead duplicate 'os.path.splitext' call that preceded
    the 'try' block; narrowed the bare 'except:'.)
    """
    from quex.engine.state_machine.transformation.base import EncodingTrafoUnicode, \
                                                              EncodingTrafoNone
    from quex.engine.state_machine.transformation.table import EncodingTrafoByTable
    from quex.engine.state_machine.transformation.utf8_state_split import EncodingTrafoUTF8
    from quex.engine.state_machine.transformation.utf16_state_split import EncodingTrafoUTF16

    if BufferCodecName == "none":
        return EncodingTrafoNone()
    elif BufferCodecName == "utf8":
        return EncodingTrafoUTF8()
    elif BufferCodecName == "utf16":
        return EncodingTrafoUTF16()
    elif BufferCodecFileName:
        # Validate the file name string; a malformed string aborts here.
        try:
            os.path.splitext(os.path.basename(BufferCodecFileName))
        except Exception:
            error.log("cannot interpret string following '--encoding-file'")
        return EncodingTrafoByTable(FileName=BufferCodecFileName)
    elif BufferCodecName in ("unicode", "utf32"):
        # (Still, 'icu' or 'iconv' may provide converted content, but ...)
        # If the internal buffer is 'unicode', then the pattern's state
        # machines are not converted. The requirement for the pattern's
        # range is the same as for the 'buffer element chunks'.
        return EncodingTrafoUnicode(NumberSet(Interval(0, 0x110000)),
                                    Name=BufferCodecName)
    elif BufferCodecName == "unit-test":
        return EncodingTrafoUnicode(NumberSet_All(), NumberSet_All())
    else:
        return EncodingTrafoByTable(BufferCodecName)
def do(section_list, fh):
    """Parses a codec information file. The described codec can only be a
    'static character length' encoding. That is every character in the
    code occupies the same number of bytes.

    RETURNS: [0] Set of characters in unicode which are covered by the
                 described codec.
             [1] Range of values in the codec elements.
             [2] Error string, or None on success.

    (Idiom fix: 'list.append(section_list, ...)' replaced by
    'section_list.append(...)'.)
    """
    source_set = NumberSet()
    drain_set = NumberSet()
    error_str = None
    try:
        while error_str is None:
            skip_whitespace(fh)
            source_begin = read_integer(fh)
            if source_begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                continue
            skip_whitespace(fh)
            source_size = read_integer(fh)
            if source_size is None:
                error_str = "Missing integer (source interval size) in codec file."
                continue
            skip_whitespace(fh)
            target_begin = read_integer(fh)
            if target_begin is None:
                error_str = "Missing integer (target interval begin) in codec file."
                continue
            source_end = source_begin + source_size
            section_list.append([source_begin, source_end, target_begin])
            source_set.add_interval(Interval(source_begin, source_end))
            drain_set.add_interval(Interval(target_begin, target_begin + source_size))
    except EndOfStreamException:
        # Normal termination: file read to its end.
        pass
    return source_set, drain_set, error_str
# ___ Command-line driven test: character-set skipper ______________________
# First argument selects the target language implementation.
if len(sys.argv) < 2 or not (sys.argv[1] in ["ANSI-C-PlainMemory", "ANSI-C", "Cpp", "Cpp_StrangeStream"]):
    print "Language argument not acceptable, use --hwut-info"
    sys.exit(0)

Language = sys.argv[1]
__Setup_init_language_database(Language)

# 'StrangeStream' variants compile with a special istream implementation.
StrangeStream_str = ""
if Language.find("StrangeStream") != -1:
    StrangeStream_str = " -DQUEX_OPTION_STRANGE_ISTREAM_IMPLEMENTATION "

# The skipper shall skip over [a-zA-Z].
trigger_set = NumberSet([Interval(ord('a'), ord('z') + 1), Interval(ord('A'), ord('Z') + 1)])

TestStr = "abcdefg_HIJKLMNOP-qrstuvw'XYZ12ok3"
compile_and_run(Language, create_character_set_skipper_code(Language, TestStr, trigger_set),
                StrangeStream_str=StrangeStream_str)

TestStr = "-hijklmnop_qrstuvw#xyz9"
# NOTE(review): the following call appears truncated in this chunk of the
# file; its remaining arguments are outside the visible range.
compile_and_run(Language, create_character_set_skipper_code(Language, TestStr, trigger_set),
# ___ Command-line driven test: transformation by the cp037 table __________
if "1" in sys.argv:
    # Mode '1': transform every single code point -2 .. 257 individually.
    def test(UC):
        global trafo_cp037
        x = NumberSet(UC)
        y = x.clone()
        x.transform_by_table(trafo_cp037)
        x.assert_consistency()
        print "0x%02X --> 0x%s" % (UC, x.get_string(Option="hex"))
    for letter in xrange(-2, 258):
        test(letter)
elif "all" in sys.argv:
    # Mode 'all': transform the complete byte range at once.
    x = NumberSet(Interval(0, 0x100))
    y = x.clone()
    x.transform_by_table(trafo_cp037)
    x.assert_consistency()
    print "0x%s --> 0x%s" % (y, x.get_string(Option="hex"))
elif "some" in sys.argv:
    # Mode 'some': transform two partial ranges.
    x = NumberSet(Interval(0, 0x32))
    y = x.clone()
    x.transform_by_table(trafo_cp037)
    x.assert_consistency()
    print "0x%s --> 0x%s" % (y, x.get_string(Option="hex"))
    x = NumberSet(Interval(0x42, 0x80))
    y = x.clone()
    # NOTE(review): this branch appears truncated in the visible chunk.
    x.transform_by_table(trafo_cp037)
    def __init__(self):
        self.match_set  = NumberSet()   # characters collected so far
        self.negation_f = False         # True => the collected set is negated
        # (tail of an enclosing drop-out printing loop whose head starts
        #  before this chunk of the file)
        if ta.door_id in done:
            continue
        assert len(ta.command_list) == 1
        cmd = ta.command_list[0]
        print "%s => %s" % (ta.door_id, cmd.content.router_element)
        done.add(ta.door_id)

def print_this(AnalyzerList):
    """Print every analyzer with its init state index and drop-outs."""
    print "#_[ Print %i analyzer(s) ]______________________________" % len(AnalyzerList)
    print
    for i, analyzer in enumerate(AnalyzerList):
        print "--( %i: init si = %i )-------------------------\n" % (i, analyzer.init_state_index)
        print analyzer
        print_drop_out(analyzer)

# Single-character sets used to build the test loop maps.
NS_A = NumberSet.from_range(0x600, 0x601)  # UTF8: D8 80 => 216, 128
NS_B = NumberSet.from_range(0x601, 0x602)  # UTF8: D8 81 => 216, 129
NS_C = NumberSet.from_range(0x640, 0x641)  # UTF8: D9 80 => 217, 128

appendix_sm_id = 4711L

if "loop" in sys.argv:
    # Plain loop entry: no appendix state machine attached.
    loop_map = loop.LoopMap([
        TestLME(NS_A, dial.new_incidence_id(), None),
    ])
    column_n_per_code_unit = 5
elif "appendix" in sys.argv:
    # Couple entry: transition continues into an appendix state machine.
    loop_map = loop.LoopMap([
        TestLME(NS_A, dial.new_incidence_id(), appendix_sm_id),  # appendix_sm_id
    ])
    column_n_per_code_unit = 5
def get_sm(SmId, Trigger):
    """State machine with a single transition on code point 'Trigger',
    carrying the id 'SmId'."""
    trigger_set = NumberSet.from_range(Trigger, Trigger + 1)
    sm = StateMachine.from_IncidenceIdMap([(trigger_set, SmId)])
    sm.set_id(SmId)
    return sm
def get_codec_element_range():
    """Codec element's size is 1 byte => value range [0, 0x100)."""
    byte_range = NumberSet.from_range(0, 0x100)
    return byte_range
def get_elementary_trigger_sets(StateIdxList, sm=None, epsilon_closure_db=None):
    """NOTE: 'epsilon_closure_db' must previously be calculated by
    sm.get_epsilon_closure_db(). This has to happen once and for all in
    order to save computation time.

    Considers the trigger dictionary that contains a mapping from target
    state index to the trigger set that triggers to it:

             target_state_index   --->   trigger_set

    The trigger sets of different target state indices may intersect. As a
    result, this function produces a list of pairs:

          [ state_index_list, elementary_trigger_set ]

    where the elementary trigger set is the set of all triggers that
    trigger at the same time to all states in the state_index_list. The
    list contains for one state_index_list only one elementary_trigger_set.
    All elementary trigger sets are disjunct, i.e. they do not intersect.

    NOTE: A general solution of this problem would have to consider the
    inspection of all possible subset combinations. The number of
    combinations for N trigger sets is 2^N - which potentially blows the
    calculation power of the computer. Excessive optimizations would have
    to be programmed, if not the following were the case:

    NOTE: Fortunately, we are dealing with one dimensional sets! Thus,
    there is a very effective way to determine the elementary trigger
    sets. Imagine three trigger sets stretching over the range of numbers
    as follows:

    letters of alphabet ---------------------------------------------------->

                  T0  [---------)       [----------)
                  T1          [------)      [-----)
                  T2              [----------------------)

    => elementary sets:

       only T0       [-------)
       T0, T1                [-)
       only T1                 [-)
       T1, T2                        [--)
       only T2                          [---)     [----)
       T0, T2                               [---)       [)
       T0, T1, T2                               [-----)

    RETURNS: dict mapping

        tuple(sorted epsilon closure of target states) ---> NumberSet
    """
    # (Commented-out historical notes -- a slower alternative implementation,
    #  a single-state shortcut, and a stale 'proposal' comparison -- have
    #  been condensed away; see version control history for details.)

    # (*) Accumulate the transitions for all states in the state list.
    #     Transitions to the same target state are combined by union.
    #     'history' is the sorted sequence of interval borders (sweep line).
    history = _get_plain_line_up(
                  [sm.states[si].target_map for si in StateIdxList])

    # (*) build the elementary subset list
    combinations = {}               # use dictionary for uniqueness
    current_interval_begin = None
    current_target_indices = {}     # use dictionary for uniqueness
    current_target_epsilon_closure = []

    for item in history:
        # -- add interval and target indice combination to the data
        #    (only build interval when current begin is there,
        #     when the interval size is not zero, and
        #     when the epsilon closure of target states is not empty)
        if current_interval_begin is not None \
           and current_interval_begin != item.position \
           and len(current_target_indices) != 0:
            interval = Interval(current_interval_begin, item.position)
            key = tuple(sorted(current_target_epsilon_closure))
            combination = combinations.get(key)
            if combination is None:
                combinations[key] = NumberSet(interval, ArgumentIsYoursF=True)
            else:
                combination.unite_with(interval)

        # -- BEGIN / END of interval:
        #    add or delete a target state to the set of currently considered
        #    target states.
        #    NOTE: More than one state can trigger on the same range to the
        #          same target state. Thus, one needs to keep track of the
        #          'opened' target states (reference counting).
        if item.change == E_Border.BEGIN:
            if current_target_indices.has_key(item.target_idx):
                current_target_indices[item.target_idx] += 1
            else:
                current_target_indices[item.target_idx] = 1
        else:        # == E_Border.END
            if item.target_idx not in current_target_indices:
                print "#ERROR:", history
            if current_target_indices[item.target_idx] > 1:
                current_target_indices[item.target_idx] -= 1
            else:
                del current_target_indices[item.target_idx]

        # -- re-compute the epsilon closure of the target states
        current_target_epsilon_closure = \
            sm.get_epsilon_closure_of_state_set(current_target_indices,
                                                epsilon_closure_db)
        # -- set the begin of interval to come
        current_interval_begin = item.position

    # (*) create the list of pairs [target-index-combination, trigger_set]
    return combinations
    # (tail of a script-name list whose beginning lies before this chunk)
    "Buhid", "Canadian_Aboriginal", "Cherokee",
    "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le",
    "Yi",
])

# Combine all script state machines and split them into UTF8 code units.
orig = get_combined_state_machine(map(lambda x: x.sm, sets))
print "# Number of states in state machine:"
print "# Unicode: %i" % len(orig.states)
result = trafo.do(orig)
print "# UTF8-Splitted: %i" % len(result.states)
# print result.get_graphviz_string(Option="hex")

# Positive check: every script set must still match after the split.
for set in sets:
    set.check(result)

# Negative check: nothing outside the union of all sets may match.
union = NumberSet()
for nset in map(lambda set: set.charset, sets):
    union.unite_with(nset)
inverse_union = NumberSet(Interval(0, 0x110000))
inverse_union.subtract(union)
# print inverse_union.get_string(Option="hex")
check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True))
def __init__(self, Name, CodeUnitRange):
    """Construct the transformation: the source set is the complete
    Unicode code point range [0, 0x110000); the drain is 'CodeUnitRange'.
    """
    unicode_source_set = NumberSet.from_range(0, 0x110000)
    base.EncodingTrafo.__init__(self, Name, unicode_source_set, CodeUnitRange)
sys.exit(0) if len(sys.argv) < 2: print "Argument not acceptable, use --hwut-info" sys.exit(0) BS = int(sys.argv[1]) if BS not in [5, 6, 7, 8]: print "Argument not acceptable, use --hwut-info" sys.exit(0) Language = "Cpp" __Setup_init_language_database(Language) trigger_set = NumberSet([Interval(0x600, 0x700)]) Setup.buffer_codec_set(bc_factory.do("utf8"), 1) def make(TriggerSet, BufferSize): Language = "ANSI-C-from-file" code = create_character_set_skipper_code(Language, "", TriggerSet, QuexBufferSize=BufferSize, InitialSkipF=False, OnePassOnlyF=True) exe_name, tmp_file_name = compile(Language, code) return exe_name, tmp_file_name
def _get_loop_map(loop_config, CaMap, SmList, IidLoopExit, L_subset):
    """Construct the map which determines the core loop's reaction to an
    incoming character. Three kinds of reactions exist:

       -- count and return to the loop entry (normal, 'plain' case),
       -- count and enter the tail (appendix) of a parallel state machine,
       -- exit the loop.

    L_subset: NumberSet of characters that actually take part in the loop;
              'None' => all characters of 'CaMap' are considered.

    RETURNS: [0] LoopMap, a list of LoopMapEntry-s, each consisting of:
                 .character_set:       character set that triggers.
                 .count_action:        related count action;
                                       == None, if it causes 'loop exit'.
                 .iid_couple_terminal: incidence id of the triggered terminal
                                       (count action or couple terminal).
                 .appendix_sm:         combined appendix state machine,
                                       or None if there is none.
             [1] list of combined appendix state machines.
             [2] data base of appendix command lists.
    """
    # Parallel state machines must carry matching incidence ids and must
    # not have been transformed yet (no bad-lexatom detector states).
    assert all(_state_machine_tagged_with_matching_incidence_ids(sm)
               for sm in SmList)
    assert all(not _exists_bad_lexatom_detector_state(sm) for sm in SmList)

    # Restrict counting to what the buffer's encoding can deliver.
    CaMap.prune(Setup.buffer_encoding.source_set)

    all_set    = CaMap.union_of_all()
    couple_set = NumberSet.from_union_of_iterable(
        sm.get_beginning_character_set() for sm in SmList)

    # 'plain': transitions to 'normal terminals'
    #          => perform count action and loop.
    plain_set = all_set.difference(couple_set)
    if L_subset is not None:
        plain_set.intersect_with(L_subset)

    loop_set = plain_set.union(couple_set)
    exit_set = loop_set.get_complement(Setup.buffer_encoding.source_set)

    plain_list = _get_LoopMapEntry_list_plain(loop_config, CaMap, plain_set)

    exit_list = []
    if not exit_set.is_empty():
        exit_list.append(
            LoopMapEntry(exit_set, IidLoopExit,
                         Code=loop_config.cmd_list_CA_GotoTerminal(None,
                                                                   IidLoopExit)))

    # 'couple': transitions to 'couple terminals'
    #           => connect to appendix state machines.
    couple_list,               \
    combined_appendix_sm_list, \
    appendix_cmd_list_db       = parallel_state_machines.do(loop_config,
                                                            CaMap, SmList)

    # Sanity: the couple entries must cover exactly the couple character set.
    assert couple_set.is_equal(
        NumberSet.from_union_of_iterable(lei.character_set
                                         for lei in couple_list))

    loop_map = LoopMap(couple_list,  # --> jump to appendix sm-s
                       plain_list,   # --> iterate to loop start
                       exit_list)    # --> exit loop

    return loop_map, combined_appendix_sm_list, appendix_cmd_list_db
print "Argument not acceptable, use --hwut-info" sys.exit(0) def build(ISetup): Language = "ANSI-C" txt = create_indentation_handler_code(Language, "<by command line>", ISetup, BufferSize=3) executable_name, \ source = compile(Language, txt, AssertsActionvation_str = "") return executable_name, source pattern_newline = get_Pattern_Prep(DFA.from_character_set(NumberSet( ord('\n')))) pattern_suppressed_newline = get_Pattern_Prep( DFA.from_sequence([ord(x) for x in "\\\n"])) indent_setup = IndentationCount_Pre( SourceRef_VOID, WhiteSpaceCharacterSet=NumberSet([Interval(ord(x)) for x in " :"]), BadSpaceCharacterSet=None, PatternNewline=pattern_newline, PatternSuppressedNewline=pattern_suppressed_newline, PatternListComment=[]) if "FIRST" in sys.argv or len(sys.argv) <= 2: exe, tmp_file = build(indent_setup) exe = "tmp.c.exe"
def get_trigger_set_union(self):
    """Union of all trigger sets stored in 'self.__db', as one NumberSet."""
    all_intervals = [
        interval
        for trigger_set in self.__db.itervalues()
        for interval in trigger_set.get_intervals()
    ]
    return NumberSet.from_IntervalList(all_intervals)
if lei.appendix_sm_has_transitions_f ] def print_this(AnalyzerList): print "#_[ Print %i analyzer(s) ]______________________________" % len( AnalyzerList) print for i, analyzer in enumerate(AnalyzerList): print "--( %i: init si = %i )-------------------------\n" % ( i, analyzer.init_state_index) print analyzer if encoding == "unicode": NS_A = NumberSet.from_range(ord('A'), ord('A') + 1) NS_B = NumberSet.from_range(ord('B'), ord('B') + 1) NS_C = NumberSet.from_range(ord('C'), ord('C') + 1) NS_D = NumberSet.from_range(ord('D'), ord('D') + 1) NS_E = NumberSet.from_range(ord('E'), ord('E') + 1) else: NS_A = NumberSet.from_range(0x600, 0x601) NS_B = NumberSet.from_range(0x601, 0x602) NS_C = NumberSet.from_range(0x602, 0x603) NS_D = NumberSet.from_range(0x603, 0x604) NS_E = NumberSet.from_range(0x604, 0x605) CA_0 = CountAction(E_CharacterCountType.COLUMN, 5) CA_1 = CountAction(E_CharacterCountType.LINE, 1) CA_2 = CountAction(E_CharacterCountType.GRID, 2) CA_3 = CountAction(E_CharacterCountType.WHITESPACE, 3)
#! /usr/bin/env python import sys import os sys.path.insert(0, os.environ["QUEX_PATH"]) from quex.engine.misc.interval_handling import Interval, NumberSet from quex.constants import INTEGER_MAX all = NumberSet.from_range(-INTEGER_MAX, INTEGER_MAX) if "--hwut-info" in sys.argv: print "NumberSet: Inverse" print "CHOICES: 1, 2, serious;" sys.exit(0) def test(NSet): print "# write output in temporary file: 'tmp'" print "# plot with gnuplot:" print "# > plot \"tmp\" w l" print NSet.gnuplot_string(1) result = NSet.get_complement(all) result.assert_consistency() print result.gnuplot_string(0) if "1" in sys.argv: test(NumberSet([Interval(10,20), Interval(21,30), Interval(50,70), Interval(71,80), Interval(80,81), Interval(82,90), Interval(90,100), Interval(110,130), Interval(150,170), Interval(171,190),
def get_Pattern(ValueList):
    """Pattern that matches exactly the single characters in 'ValueList'."""
    code_point_set = NumberSet(
        [Interval(ord(character)) for character in ValueList])
    return Pattern.from_character_set(code_point_set)
class EncodingTrafoUTF16(EncodingTrafoBySplit):
    """Transforms a Unicode-based state machine into one that triggers on
    UTF16 code units. Code points < 0x10000 are represented by a single,
    numerically identical code unit; larger code points by a surrogate
    pair of two code units.
    """
    # Code points in [0, 0x10000) map to one identical code unit
    # => that range remains 'unchanged' by the transformation.
    UnchangedRange = 0x10000

    def __init__(self):
        EncodingTrafoBySplit.__init__(self, "utf16",
                                      CodeUnitRange=NumberSet.from_range(0, 0x10000))
        # Error range of the FIRST code unit: complement of the admissible
        # values [0, 0xDC00) and [0xE000, 0x10000), i.e. the low surrogates
        # 0xDC00..0xDFFF (plus everything beyond 16 bit).
        self.error_range_code_unit0 = NumberSet([
            Interval(0x0000, 0xDC00), Interval(0xE000, 0x10000)
        ]).get_complement(NumberSet_All())

        # Error range of the SECOND code unit: complement of the low
        # surrogate range [0xDC00, 0xE000).
        self.error_range_code_unit1 = NumberSet([
            Interval(0xDC00, 0xE000)
        ]).get_complement(NumberSet_All())

    def prune(self, number_set):
        # Remove code points that UTF16 cannot encode: the surrogate range
        # ('ForbiddenRange') and anything at or beyond the Unicode limit.
        global ForbiddenRange
        number_set.subtract(ForbiddenRange)
        number_set.mask(0, 0x110000)

    def get_interval_sequences(self, Orig):
        # RETURNS: list of interval sequences; each sequence is the chain
        #          of code unit intervals for a contiguous part of 'Orig'.
        interval_1word, intervals_2word = _get_contigous_intervals(Orig)

        result = []
        if interval_1word is not None:
            # Single code unit => sequence of length one.
            result.append([interval_1word])

        if intervals_2word is not None:
            result.extend(
                _get_trigger_sequence_for_interval(interval)
                for interval in intervals_2word
            )
        return result

    def lexatom_n_per_character(self, CharacterSet):
        """If all characters in a unicode character set state machine
        require the same number of code units to be represented, this
        number is returned. Otherwise, 'None' is returned.

        RETURNS:   N > 0  number of code units required to represent any
                          character in the given set.
                   None   characters in the set require different numbers
                          of code units.
        """
        assert isinstance(CharacterSet, NumberSet)

        interval_list = CharacterSet.get_intervals(PromiseToTreatWellF=True)
        front = interval_list[0].begin     # First element of number set
        back  = interval_list[-1].end - 1  # Last element of number set
        # Determine the number of code units required to represent the
        # first and the last character of the number set. The number of
        # code units per character increases monotonously, so only the
        # borders have to be considered.
        front_chunk_n = len(unicode_to_utf16(front))
        back_chunk_n  = len(unicode_to_utf16(back))
        if front_chunk_n != back_chunk_n: return None
        else:                             return front_chunk_n

    def _plug_encoding_error_detectors(self, sm):
        """Adorn states with transitions to the 'on_encoding_error' handler
        if the input value lies beyond the limits.

        The state machine is an implementation of linear sequences of
        intervals. Thus, the 'code unit position' can be determined by the
        number of transitions from the init state.

        sm = mini state machine that implements the transition sequences.

        Bad ranges for the (up to two) code units of a character:

           1st: 0xDC00 - 0xDFFF     (a low surrogate must not come first)
           2nd: 0x0000 - 0xDBFF, 0xE000 and beyond
                                    (the second unit must be a low surrogate)
        """
        # 'CodeUnit[0]' appears at the init state.
        # (Adapt the trigger map before entering the 'bad lexatom state'.)
        init_tm = sm.get_init_state().target_map.get_map()
        workset = set(init_tm.iterkeys())
        # Sanity: no existing transition may already overlap the error range.
        for si, trigger_set in init_tm.iteritems():
            assert not trigger_set.has_intersection(self.error_range_code_unit0)

        bad_lexatom_state_index = self._plug_encoding_error_detector_single_state(sm, init_tm)

        # 'CodeUnit[>0]' appear all at later states.
        done = set([bad_lexatom_state_index])
        while workset:
            si = workset.pop()
            tm = sm.states[si].target_map.get_map()
            done.add(si)

            # Only add bad lexatom detection to state that transit on lexatoms
            # (Bad lexatom states, btw. do not have transitions)
            if not tm: continue

            for trigger_set in tm.itervalues():
                assert not trigger_set.has_intersection(self.error_range_code_unit1)

            # Breadth-first expansion over not-yet-visited successor states.
            workset.update(new_si for new_si in tm.iterkeys() if new_si not in done)
            tm[bad_lexatom_state_index] = self.error_range_code_unit1

    def _plug_encoding_error_detector_single_state(self, sm, target_map):
        # Access (or create) the bad-lexatom state and let 'target_map'
        # trigger to it on the first code unit's error range.
        # RETURNS: state index of the bad-lexatom state.
        bad_lexatom_state_index = sm.access_bad_lexatom_state()
        if target_map:
            target_map[bad_lexatom_state_index] = self.error_range_code_unit0
        return bad_lexatom_state_index

    def adapt_source_and_drain_range(self, LexatomByteN):
        # Adapt all ranges to the width of the lexatom storage;
        # 'LexatomByteN == -1' means 'no size restriction'.
        EncodingTrafoBySplit.adapt_source_and_drain_range(self, LexatomByteN)
        self.error_range_code_unit0.mask_interval(self.lexatom_range)
        self.error_range_code_unit1.mask_interval(self.lexatom_range)
        if LexatomByteN == -1:
            return
        elif LexatomByteN >= 2:
            return
        else:
            # if there are less than 2 byte for the lexatoms, then only the
            # unicode range from 0x00 to 0xFF can be treated.
            self.source_set.mask(0x00, 0x100)
self.__cursor[k] += 2 if self.__cursor[k] < 8: break else: self.__cursor[k] += 1 if self.__cursor[k] < 3: break self.__cursor[k] = 1 k += 1 return result generator = NumberSetGenerator() all = NumberSet.from_range(-sys.maxint, sys.maxint) # Generate 100 NumberSets number_set_list = [] for i in range(100): result = generator.get() number_set_list.append(generator.get()) def test(N1, Op1, N2, Op2): global number_set_list the_tester = Tester(N1, Op1, N2, Op2) # Permutate all existing intervals against each other count_n = 0 for i, x in enumerate(number_set_list):
def get_unicode_range():
    """The complete Unicode code point range [0, 0x10FFFF] as a NumberSet."""
    first, beyond_last = 0, 0x110000
    return NumberSet.from_range(first, beyond_last)
def create_random_number_set():
    """Build a NumberSet from a randomly generated interval list.

    The set is '.clean()'-ed before it is returned, so overlapping or
    adjacent intervals are consolidated.
    """
    number_set = NumberSet()
    for lower, upper in create_random_interval_list(False):
        number_set.quick_append_interval(Interval(lower, upper))
    number_set.clean()
    return number_set
def _enter(result, begin, end, target_state_setup):
    """Register the interval [begin, end) under key 'target_state_setup'
    in the dictionary 'result'; extend the existing NumberSet if the key
    is already present.
    """
    existing = result.get(target_state_setup)
    if existing is not None:
        existing.quick_append_interval(Interval(begin, end))
    else:
        result[target_state_setup] = NumberSet.from_range(begin, end)