Exemple #1
0
def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence):
    script_list = [
        "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille",
        "Buginese", "Buhid", "Canadian_Aboriginal", "Cherokee", "Common",
        "Cuneiform", "Cypriot", "Deseret", "Gothic", "Greek", "Hanunoo",
        "Hebrew", "Hiragana", "Inherited", "Kannada", "Han", "Katakana",
        "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B",
        "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya",
        "Ogham", "Old_Italic", "Old_Persian", "Phoenician", "Shavian",
        "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil",
        "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi"
    ]
    sets = [X(name) for name in script_list]

    orig = combination.do(map(lambda x: x.sm, sets))
    state_n_before, result = transform(Trafo, orig)

    # print result.get_graphviz_string(Option="hex")

    for set in sets:
        set.check(result, unicode_to_transformed_sequence)
    print "Translated %i groups without abortion on error (OK)" % len(sets)

    union = NumberSet()
    for nset in map(lambda set: set.charset, sets):
        union.unite_with(nset)

    inverse_union = NumberSet(Interval(0, 0x110000))
    inverse_union.subtract(union)
    # print inverse_union.get_string(Option="hex")
    check_negative(result,
                   inverse_union.get_intervals(PromiseToTreatWellF=True),
                   unicode_to_transformed_sequence)
Exemple #2
0
def general_checks(loop_map, appendix_sm_list):
    print "#_[ Checks ]__________________________________________________"
    print
    print "character sets do not intersect",
    all_set = NumberSet()
    for lei in loop_map:
        assert lei.character_set is not None
        assert not lei.character_set.has_intersection(all_set)
        all_set.unite_with(lei.character_set)
    print "[ok]"

    print "count actions do not appear more than once",
    count_action_couple_set = set()
    count_action_plain_set  = set()
    exit_exists_f           = False
    appendix_sm_id_set      = set()
    for lei in loop_map:
        if lei.count_action is None: 
            assert lei.appendix_sm_id is None
            exit_exists_f = True
        elif lei.appendix_sm_id is None:
            assert lei.incidence_id not in count_action_plain_set
            count_action_plain_set.add(lei.incidence_id)
        else:
            assert lei.incidence_id not in count_action_couple_set
            count_action_couple_set.add(lei.incidence_id)
            appendix_sm_id_set.add(lei.appendix_sm_id)
    print "[ok]"
    list_id_set = set(sm.get_id() for sm in appendix_sm_list)
    assert appendix_sm_id_set == list_id_set
    print "appendix sm-ids are the same in loop map and sm list: [ok]"
    print "exit character set exits: [%s]" % exit_exists_f

    print
Exemple #3
0
def general_checks(loop_map, appendix_sm_list):

    print "#_[ Checks ]__________________________________________________"
    print
    print "character sets do not intersect",
    all_set = NumberSet()
    for lei in loop_map:
        assert lei.character_set is not None
        assert not lei.character_set.has_intersection(all_set)
        all_set.unite_with(lei.character_set)
    print "[ok]"

    print "count actions do not appear more than once",
    count_action_couple_set = set()
    count_action_plain_set = set()
    appendix_sm_id_set = set()
    print "[ok]"

    ## if "Split" in sys.argv or "Plain" in sys.argv:
    ##     list_id_set = set(sm.get_id() for sm in appendix_sm_list)
    ##     assert appendix_sm_id_set == list_id_set
    ##     print "appendix sm-ids are the same in loop map and sm list: [ok]"
    print "exit character set exits: [%s]" % any(lei.aux_count_action is None
                                                 for lei in loop_map)

    print
Exemple #4
0
def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence):
    script_list = [
        "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille", "Buginese", "Buhid",
        "Canadian_Aboriginal", "Cherokee", "Common",  "Cuneiform",  "Cypriot",  "Deseret",
        "Gothic",  "Greek",  
        "Hanunoo", "Hebrew", "Hiragana", "Inherited", "Kannada", "Han",  
        "Katakana", "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B", "Malayalam",
        "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya", "Ogham", "Old_Italic", "Old_Persian",
        "Phoenician",  "Shavian",  "Syloti_Nagri", 
        "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai",
        "Tibetan", "Tifinagh", "Ugaritic", "Yi"
    ]
    sets = [ X(name) for name in script_list ]

    orig = get_combined_state_machine(map(lambda x: x.sm, sets))
    state_n_before, result = transform(Trafo, orig)

    # print result.get_graphviz_string(Option="hex")

    for set in sets:
        set.check(result, unicode_to_transformed_sequence)
    print "Translated %i groups without abortion on error (OK)" % len(sets)

    union = NumberSet()
    for nset in map(lambda set: set.charset, sets):
        union.unite_with(nset)

    inverse_union = NumberSet(Interval(0, 0x110000))
    inverse_union.subtract(union)
    # print inverse_union.get_string(Option="hex")
    check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True), 
                   unicode_to_transformed_sequence)
Exemple #5
0
def general_checks(loop_map, appendix_sm_list):
    print "#_[ Checks ]__________________________________________________"
    print
    print "character sets do not intersect",
    all_set = NumberSet()
    for lei in loop_map:
        assert lei.character_set is not None
        assert not lei.character_set.has_intersection(all_set)
        all_set.unite_with(lei.character_set)
    print "[ok]"

    print "count actions do not appear more than once",
    count_action_couple_set = set()
    count_action_plain_set = set()
    exit_exists_f = False
    appendix_sm_id_set = set()
    for lei in loop_map:
        if lei.count_action is None:
            assert lei.appendix_sm_id is None
            exit_exists_f = True
        elif lei.appendix_sm_id is None:
            assert lei.incidence_id not in count_action_plain_set
            count_action_plain_set.add(lei.incidence_id)
        else:
            assert lei.incidence_id not in count_action_couple_set
            count_action_couple_set.add(lei.incidence_id)
            appendix_sm_id_set.add(lei.appendix_sm_id)
    print "[ok]"
    list_id_set = set(sm.get_id() for sm in appendix_sm_list)
    assert appendix_sm_id_set == list_id_set
    print "appendix sm-ids are the same in loop map and sm list: [ok]"
    print "exit character set exits: [%s]" % exit_exists_f

    print
Exemple #6
0
 def __get_remaining_set(self):
     """Unites the character sets of all map entries whose count type is
        none of BAD, BEGIN_NEWLINE_SUPPRESSOR, BEGIN_NEWLINE, END_NEWLINE.

        RETURNS: What 'get_complement()' delivers for that union, given
                 'Setup.buffer_codec.source_set' as argument.
     """
     skipped_type_list = (
         E_CharacterCountType.BAD,
         E_CharacterCountType.BEGIN_NEWLINE_SUPPRESSOR,
         E_CharacterCountType.BEGIN_NEWLINE,
         E_CharacterCountType.END_NEWLINE,
     )
     covered = NumberSet()
     for character_set, info in self.__map:
         if info.cc_type not in skipped_type_list:
             covered.unite_with(character_set)
     return covered.get_complement(Setup.buffer_codec.source_set)
Exemple #7
0
 def get_ending_character_set(self):
     """Returns the union of all characters that trigger to an acceptance
        state in the given state machine. This is to detect whether the
        newline or suppressor end with an indentation character (grid or space).

        For every index delivered by 'get_acceptance_state_index_list()',
        the target maps of all states are searched for transitions to
        that index; the related trigger sets are united.

        RETURNS: NumberSet; it remains as constructed, if no state
                 has a transition to an acceptance state.
     """
     result = NumberSet()
     for end_state_index in self.get_acceptance_state_index_list():
         for state in self.states.itervalues():
             target_map = state.target_map
             # Plain truth test instead of a comparison with 'False'.
             if not target_map.has_target(end_state_index):
                 continue
             result.unite_with(target_map.get_trigger_set_to_target(end_state_index))
     return result
Exemple #8
0
    def _assert_consistency(self):
        """Assert that the entries of this container are well-formed:

           -- no entry is None,
           -- every entry has a character set,
           -- every entry has an 'iid_couple_terminal' or a 'code',
           -- no character set intersects with the character sets of the
              entries before it.
        """
        assert all(lme is not None for lme in self)
        assert all(lme.character_set is not None for lme in self)
        assert all((lme.iid_couple_terminal is not None) or (lme.code is not None)
                   for lme in self)

        # Assert: Transition triggers do not intersect!
        covered = NumberSet()
        for lme in self:
            assert not lme.character_set.has_intersection(covered)
            covered.unite_with(lme.character_set)
Exemple #9
0
    def __wildcard_value_match(self, WildCardValue):
        """Unites the code point sets of all property values delivered by
        'get_wildcard_value_matches()' for 'WildCardValue'.

        RETURNS: None,      if there is no matching value.
                 NumberSet, else. It is built anew on each call, so it is
                            no reference into 'code_point_db'.
        """
        union = NumberSet()

        matched_value_list = self.get_wildcard_value_matches(WildCardValue)
        if len(matched_value_list) == 0:
            return None

        for matched_value in matched_value_list:
            code_point_set = NumberSet(self.code_point_db[matched_value])
            union.unite_with(code_point_set)

        return union
Exemple #10
0
    def __wildcard_value_match(self, WildCardValue):
        """Unites the code point sets of all property values delivered by
        'get_wildcard_value_matches()' for 'WildCardValue'.

        RETURNS: None,      if there is no matching value.
                 NumberSet, else. It is built anew on each call, so it is
                            no reference into 'code_point_db'.
        """
        union = NumberSet()

        matched_value_list = self.get_wildcard_value_matches(WildCardValue)
        if len(matched_value_list) == 0:
            return None

        for matched_value in matched_value_list:
            code_point_set = NumberSet(self.code_point_db[matched_value])
            union.unite_with(code_point_set)

        return union
Exemple #11
0
 def get_ending_character_set(self):
     """Returns the union of all characters that trigger to an acceptance
        state in the given state machine. This is to detect whether the
        newline or suppressor end with an indentation character (grid or space).

        For every index delivered by 'get_acceptance_state_index_list()',
        the target maps of all states are searched for transitions to
        that index; the related trigger sets are united.

        RETURNS: NumberSet; it remains as constructed, if no state
                 has a transition to an acceptance state.
     """
     result = NumberSet()
     for end_state_index in self.get_acceptance_state_index_list():
         for state in self.states.itervalues():
             target_map = state.target_map
             # Plain truth test instead of a comparison with 'False'.
             if not target_map.has_target(end_state_index):
                 continue
             result.unite_with(
                 target_map.get_trigger_set_to_target(end_state_index))
     return result
Exemple #12
0
    def __whitespace_default(self):
        """Collects the default whitespace characters, i.e. space and tab,
        for which 'specifier_count_op_map.find_occupier()' reports no
        occupier. If none of them is available, 'error.log()' is called.

        RETURNS: NumberSet containing the available default whitespace.
        """
        result = NumberSet()
        for character in (" ", "\t"):
            candidate = NumberSet(ord(character))
            occupier  = self.specifier_count_op_map.find_occupier(candidate, set())
            if not occupier:
                result.unite_with(candidate)

        if result.is_empty():
            error.log("Trying to implement default whitespace ' ' or '\\t' failed.\n"
                      "Characters are occupied by other elements.", self.sr)
        return result
Exemple #13
0
    def is_DFA_compliant(self):
        """Determines whether the transitions of this state can be part of
           a deterministic finite automaton (DFA).

           RETURNS: True  => No epsilon transition is present and no two
                             trigger sets intersect.
                    False => An epsilon transition exists, or the same
                             trigger points to different targets.
        """
        # An epsilon transition can never be part of a DFA.
        if len(self.__epsilon_target_index_list) != 0:
            return False

        # Each trigger set is compared against the union of the trigger
        # sets that have been visited before it.
        covered = NumberSet()
        for trigger_set in self.__db.itervalues():
            if covered.has_intersection(trigger_set):
                return False
            covered.unite_with(trigger_set)

        return True
Exemple #14
0
    def is_DFA_compliant(self):
        """Determines whether the transitions of this state can be part of
           a deterministic finite automaton (DFA).

           RETURNS: True  => No epsilon transition is present and no two
                             trigger sets intersect.
                    False => An epsilon transition exists, or the same
                             trigger points to different targets.
        """
        # An epsilon transition can never be part of a DFA.
        if len(self.__epsilon_target_index_list) != 0:
            return False

        # Each trigger set is compared against the union of the trigger
        # sets that have been visited before it.
        covered = NumberSet()
        for trigger_set in self.__db.itervalues():
            if covered.has_intersection(trigger_set):
                return False
            covered.unite_with(trigger_set)

        return True
            for cmd in result.states[s_idx].single_entry:
                assert not cmd.is_acceptance()

    print " (OK)"

sets = map(lambda name: X(name),
        ["Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille",
            "Hanunoo", "Hebrew", "Hiragana", "Inherited", "Kannada",
            "Katakana", "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B", "Malayalam",
            "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Ogham", "Old_Italic", "Old_Persian",
            "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai",
            "Tibetan", "Tifinagh", "Ugaritic", "Yi"])

orig = get_combined_state_machine(map(lambda x: x.sm, sets))
print "Number of states in state machine:"
print "   Unicode:       %i" % len(orig.states)
result = trafo.do(orig)
print "   UTF8-Splitted: %i" % len(result.states)

for set in sets:
    set.check(result)

union = NumberSet()
for nset in map(lambda set: set.charset, sets):
    union.unite_with(nset)

inverse_union = NumberSet(Interval(0, 0x110000))
inverse_union.subtract(union)
# print inverse_union.get_string(Option="hex")
check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True))
Exemple #16
0
    def get_character_set(self, Value=None):
        """Returns the character set that corresponds to 'Property==Value'.

           Value -- a property value, a property value alias, an alias of
                    a combination of values, or a wildcard expression.
                    Spaces are treated as underscores.
                    For binary properties 'Value' must be None.

           RETURNS: A deep copy of the related character set, so that the
                    caller never holds a reference into the internal
                    database.
                    A string containing an error message, if no character
                    set could be determined.
        """
        assert self.type != "Binary" or Value is None

        def get_value_combination(CmbAlias):
            """RETURNS: List of value names related to the combination
                        alias 'CmbAlias'.
                        Error message (string), if an alias has no name.
            """
            result = []
            for alias in self.alias_to_alias_combination_db[CmbAlias]:
                name = self.alias_to_name_map.get(alias)
                if name is None:
                    return "Unicode database error: no name related to alias '%s'" % alias
                result.append(name)
            return result

        if self.type != "Binary" and Value is None:
            return "Property '%s' requires a value setting.\n" % self.name + \
                   "Possible Values: " + \
                   self.get_value_list_help()

        if self.code_point_db is None:
            self.init_code_point_db()

        if self.type == "Binary":
            # Decouple, since we refer to an internal database
            return deepcopy(self.code_point_db)

        adapted_value = Value.replace(" ", "_")

        # All membership tests use 'adapted_value', i.e. the same key that
        # is used for the subsequent lookup. Testing 'Value' and looking up
        # 'adapted_value' could raise a 'KeyError' for values with spaces.
        if adapted_value in self.code_point_db:
            # 'value' is present as name in the code point database
            value = adapted_value

        elif adapted_value in self.alias_to_name_map:
            # 'value' is present as alias in code pointer database
            value = self.alias_to_name_map[adapted_value]

        elif adapted_value in self.alias_to_alias_combination_db:
            # 'value' is present as a combination of aliases
            value = get_value_combination(adapted_value)
            # Anything but a list is an error message => pass it on as is.
            if not isinstance(value, list): return value

        elif adapted_value in self.name_to_alias_map:
            # The value was a combination of values
            value = get_value_combination(self.name_to_alias_map[adapted_value])
            # Anything but a list is an error message => pass it on as is.
            if not isinstance(value, list): return value

        else:
            # -- WILDCARD MATCH: Results in a list of property values
            character_set = self.__wildcard_value_match(adapted_value)
            if character_set is None:
                return "Property '%s' cannot have a value or value alias '%s'.\n" % (self.name, Value) + \
                       "Possible Values: " + \
                       self.get_value_list_help()
            # No need to decouple, since character is not a reference to
            # internal database (for safety, do it)
            return deepcopy(character_set)

        if isinstance(value, list):
            # Combination of values => union of the values' code points.
            result = NumberSet()
            for element in value:
                if element == "Unassigned": continue
                entry = self.code_point_db.get(element)
                if entry is None:
                    return "%s/%s is not supported by Unicode database." % (self.name, repr(element))
                result.unite_with(entry)
        else:
            result = self.code_point_db.get(value)
            if result is None:
                return "%s/%s is not supported by Unicode database." % (self.name, repr(value))

        # Reference to internal database --> decouple with 'deepcopy'
        return deepcopy(result)
Exemple #17
0
    def covers(self, Min, Max):
        """Unites the character sets of all entries in the map.

        RETURNS: What 'covers_range(Min, Max)' reports for that union.
        """
        union = NumberSet()
        for entry in self.__map:
            union.unite_with(entry.character_set)

        return union.covers_range(Min, Max)
Exemple #18
0
def _get_all_character_set(*DbList):
    """RETURNS: NumberSet which is the union of all values of all
                dictionaries passed as arguments.
    """
    union = NumberSet()
    for character_set in (cs for db in DbList for cs in db.itervalues()):
        union.unite_with(character_set)
    return union
Exemple #19
0
def _get_all_character_set(*DbList):
    """RETURNS: NumberSet which is the union of all values of all
                dictionaries passed as arguments.
    """
    union = NumberSet()
    for character_set in (cs for db in DbList for cs in db.itervalues()):
        union.unite_with(character_set)
    return union
Exemple #20
0
    def covers(self, Min, Max):
        """Unites the character sets of all entries in the map.

        RETURNS: What 'covers_range(Min, Max)' reports for that union.
        """
        union = NumberSet()
        for entry in self.__map:
            union.unite_with(entry.character_set)

        return union.covers_range(Min, Max)
Exemple #21
0
 def get_trigger_set_union(self):
     """RETURNS: NumberSet which is the union of all trigger sets that
                 are stored as values in the transition database.
     """
     union = NumberSet()
     for candidate in self.__db.itervalues():
         union.unite_with(candidate)
     return union
    "Buhid",
    "Canadian_Aboriginal",
    "Cherokee",
    "Syloti_Nagri",
    "Syriac",
    "Tagalog",
    "Tagbanwa",
    "Tai_Le",
    "Yi",
])

orig = get_combined_state_machine(map(lambda x: x.sm, sets))
print "# Number of states in state machine:"
print "#   Unicode:       %i" % len(orig.states)
result = trafo.do(orig)
print "#   UTF8-Splitted: %i" % len(result.states)

# print result.get_graphviz_string(Option="hex")

for set in sets:
    set.check(result)

union = NumberSet()
for nset in map(lambda set: set.charset, sets):
    union.unite_with(nset)

inverse_union = NumberSet(Interval(0, 0x110000))
inverse_union.subtract(union)
# print inverse_union.get_string(Option="hex")
check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True))
Exemple #23
0
    def get_character_set(self, Value=None):
        """Returns the character set that corresponds to 'Property==Value'.

           Value -- a property value, a property value alias, an alias of
                    a combination of values, or a wildcard expression.
                    Spaces are treated as underscores.
                    For binary properties 'Value' must be None.

           RETURNS: A deep copy of the related character set, so that the
                    caller never holds a reference into the internal
                    database.
                    A string containing an error message, if no character
                    set could be determined.
        """
        assert self.type != "Binary" or Value is None

        def get_value_combination(CmbAlias):
            """RETURNS: List of value names related to the combination
                        alias 'CmbAlias'.
                        Error message (string), if an alias has no name.
            """
            result = []
            for alias in self.alias_to_alias_combination_db[CmbAlias]:
                name = self.alias_to_name_map.get(alias)
                if name is None:
                    return "Unicode database error: no name related to alias '%s'" % alias
                result.append(name)
            return result

        if self.type != "Binary" and Value is None:
            return "Property '%s' requires a value setting.\n" % self.name + \
                   "Possible Values: " + \
                   self.get_value_list_help()

        if self.code_point_db is None:
            self.init_code_point_db()

        if self.type == "Binary":
            # Decouple, since we refer to an internal database
            return deepcopy(self.code_point_db)

        adapted_value = Value.replace(" ", "_")

        # All membership tests use 'adapted_value', i.e. the same key that
        # is used for the subsequent lookup. Testing 'Value' and looking up
        # 'adapted_value' could raise a 'KeyError' for values with spaces.
        if adapted_value in self.code_point_db:
            # 'value' is present as name in the code point database
            value = adapted_value

        elif adapted_value in self.alias_to_name_map:
            # 'value' is present as alias in code pointer database
            value = self.alias_to_name_map[adapted_value]

        elif adapted_value in self.alias_to_alias_combination_db:
            # 'value' is present as a combination of aliases
            value = get_value_combination(adapted_value)
            # Anything but a list is an error message => pass it on as is.
            if not isinstance(value, list):
                return value

        elif adapted_value in self.name_to_alias_map:
            # The value was a combination of values
            value = get_value_combination(
                self.name_to_alias_map[adapted_value])
            # Anything but a list is an error message => pass it on as is.
            if not isinstance(value, list):
                return value

        else:
            # -- WILDCARD MATCH: Results in a list of property values
            character_set = self.__wildcard_value_match(adapted_value)
            if character_set is None:
                return "Property '%s' cannot have a value or value alias '%s'.\n" % (self.name, Value) + \
                       "Possible Values: " + \
                       self.get_value_list_help()
            # No need to decouple, since character is not a reference to
            # internal database (for safety, do it)
            return deepcopy(character_set)

        if isinstance(value, list):
            # Combination of values => union of the values' code points.
            result = NumberSet()
            for element in value:
                if element == "Unassigned": continue
                entry = self.code_point_db.get(element)
                if entry is None:
                    return "%s/%s is not supported by Unicode database." % (
                        self.name, repr(element))
                result.unite_with(entry)
        else:
            result = self.code_point_db.get(value)
            if result is None:
                return "%s/%s is not supported by Unicode database." % (
                    self.name, repr(value))

        # Reference to internal database --> decouple with 'deepcopy'
        return deepcopy(result)