Ejemplo n.º 1
0
    def __sm_newline_default(self):
        """Construct the default newline state machine: '(\\n)|(\\r\\n)'.

        RETURNS: StateMachine matching the default newline, or None if
                 '\\n' is already occupied by another count command.
        """
        global cc_type_name_db

        lf_set = NumberSet(ord('\n'))
        cr_set = NumberSet(ord('\r'))

        occupier = self.specifier_count_op_map.find_occupier(lf_set, set())
        if occupier is not None:
            # '\n' already carries a meaning => no default newline possible.
            error.warning("Trying to implement default newline: '\\n' or '\\r\\n'.\n" 
                          "The '\\n' option is not possible, since it has been occupied by '%s'.\n" \
                          "No newline can be defined by default."
                          % cc_type_name_db[occupier.cc_type], occupier.sr, 
                          SuppressCode=NotificationDB.warning_default_newline_0A_impossible)
            return

        sm = StateMachine.from_character_set(lf_set)

        if Setup.dos_carriage_return_newline_f:
            occupier = self.specifier_count_op_map.find_occupier(cr_set, set())
            if occupier is None:
                # '\r' is free => additionally accept the DOS sequence '\r\n'.
                sm.add_transition_sequence(sm.init_state_index, [cr_set, lf_set])
            else:
                error.warning("Trying to implement default newline: '\\n' or '\\r\\n'.\n" 
                          "The '\\r\\n' option is not possible, since '\\r' has been occupied by '%s'." \
                          % cc_type_name_db[occupier.cc_type],
                          occupier.sr, 
                          SuppressCode=NotificationDB.warning_default_newline_0D_impossible)

        return sm
Ejemplo n.º 2
0
    def __init__(self):
        """Configure the 'utf8' transformation.

        Code unit 0 accepts any admissible UTF8 lead-byte pattern; code
        units 1 to 8 accept only continuation bytes 0b10xxxxxx. The
        error ranges are the complements of those sets (adapted later).
        """
        admissible_lead = NumberSet([
            Interval(0b00000000, 0b01111111 + 1),  # 0xxxxxxx
            Interval(0b11000000, 0b11011111 + 1),  # 110xxxxx
            Interval(0b11100000, 0b11101111 + 1),  # 1110xxxx
            Interval(0b11110000, 0b11110111 + 1),  # 11110xxx
            Interval(0b11111000, 0b11111011 + 1),  # 111110xx
            Interval(0b11111100, 0b11111101 + 1),  # 1111110x
        ])
        error_range_0 = admissible_lead.get_complement(NumberSet_All())  # Adapted later

        admissible_continuation = NumberSet(Interval(0b10000000, 0b10111111 + 1))
        error_range_N = admissible_continuation.get_complement(NumberSet_All())  # Adapted later

        # Code units 1..8 share one continuation error-range object.
        error_range_by_code_unit_db = dict(
            (unit, error_range_N) for unit in range(1, 9)
        )
        error_range_by_code_unit_db[0] = error_range_0

        EncodingTrafoBySplit.__init__(self, "utf8",
                                      error_range_by_code_unit_db)
        self.UnchangedRange = 0x7F
Ejemplo n.º 3
0
def __indentation_add(Info):
    """Generate code fragments that add to an indentation counter '(I)'.

    Space-like characters add a fixed count; grid characters (encoded
    as negative counts) round '(I)' up to the next multiple of the grid
    width. Characters marked 'bad' are skipped.

    RETURNS: Generated code as string; "" if all counts are homogeneous
             single spaces (then 'end - begin' counting suffices).
    """
    # (0) Homogeneous single spaces => counting by subtraction, no code.
    if Info.homogeneous_spaces():
        return ""

    indent_txt = " " * 16

    def __append_clause(txt, CharSet, Content):
        # Emit one 'if( <condition> ) { <action> }' macro-continuation line.
        txt.append(indent_txt + "if( ")
        __condition(txt, CharSet)
        txt.append(" ) { ")
        txt.append(Content)
        txt.append(" }\\\n")

    # Group character sets by their count value.
    spaces_db = {} # Sort same space counts together
    grid_db   = {} # Sort same grid counts together
    for name, count_parameter in Info.count_db.items():
        count         = count_parameter.get()
        character_set = Info.character_set_db[name].get()
        if count == "bad":
            continue
        # Grid counts are indicated by a negative integer 'count'.
        target_db = spaces_db if count >= 0 else grid_db
        target_db.setdefault(count, NumberSet()).unite_with(character_set)

    txt = []
    for count, character_set in spaces_db.items():
        __append_clause(txt, character_set, "(I) += %i;" % count)

    for count, character_set in grid_db.items():
        __append_clause(txt, character_set, "(I) += (%i - ((I) %% %i));" % (abs(count), abs(count)))

    return "".join(txt)
Ejemplo n.º 4
0
    def add_transition(self, Trigger, TargetStateIdx): 
        """Adds a transition according to trigger and target index.

        Trigger may be None (= all remaining characters), a number
        (int/long), a list of interval borders, an Interval, or a
        NumberSet; it is normalized before being united with any
        trigger set already registered for the target.

        RETURNS: The target state index (may be created newly).
        """
        assert type(TargetStateIdx) == long \
               or TargetStateIdx is None \
               or TargetStateIdx in E_StateIndices, "%s" % TargetStateIdx.__class__.__name__
        assert Trigger.__class__ in (int, long, list, Interval, NumberSet) or Trigger is None

        # Normalize 'Trigger' to an Interval or a NumberSet.
        if Trigger is None: # This is a shorthand to trigger via the remaining triggers
            Trigger = self.get_trigger_set_union().get_complement(Setup.buffer_encoding.source_set)
        elif type(Trigger) == long: Trigger = Interval(int(Trigger), int(Trigger+1))
        elif type(Trigger) == int:  Trigger = Interval(Trigger, Trigger+1)
        elif type(Trigger) == list: Trigger = NumberSet(Trigger, ArgumentIsYoursF=True)

        # (FIX: deprecated 'dict.has_key' replaced by 'in'; the
        #  has_key + subscript pattern also performed a double lookup.)
        if TargetStateIdx in self.__db:
            # Unite with the trigger set already stored for the target.
            existing = self.__db[TargetStateIdx]
            if Trigger.__class__ == Interval: existing.add_interval(Trigger)
            else:                             existing.unite_with(Trigger)
        else:
            # First trigger for this target: always store a NumberSet.
            if Trigger.__class__ == Interval:  
                self.__db[TargetStateIdx] = NumberSet(Trigger, ArgumentIsYoursF=True)
            else:
                self.__db[TargetStateIdx] = Trigger

        return TargetStateIdx
Ejemplo n.º 5
0
def do(BufferCodecName, BufferCodecFileName=""):
    """Create the encoding transformation for a given buffer codec.

    BufferCodecName     -- "utf8", "utf16", "unicode", "unit-test", or
                           the name of a table-based codec.
    BufferCodecFileName -- if non-empty, the codec table is read from
                           this file instead.

    RETURNS: An EncodingTrafo* object implementing the codec.
    """
    from quex.engine.state_machine.transformation.base import EncodingTrafoUnicode
    from quex.engine.state_machine.transformation.table import EncodingTrafoByTable
    from quex.engine.state_machine.transformation.utf8_state_split import EncodingTrafoUTF8
    from quex.engine.state_machine.transformation.utf16_state_split import EncodingTrafoUTF16

    if BufferCodecName == "utf8":
        return EncodingTrafoUTF8()

    elif BufferCodecName == "utf16":
        return EncodingTrafoUTF16()

    elif BufferCodecFileName:
        # Sanity check: the file name must be interpretable as a path.
        # (FIX: removed a duplicate call to 'splitext' whose result was
        #  discarded before the 'try'; narrowed the bare 'except:'.)
        try:
            os.path.splitext(os.path.basename(BufferCodecFileName))
        except Exception:
            error.log("cannot interpret string following '--codec-file'")
        return EncodingTrafoByTable(FileName=BufferCodecFileName)

    elif BufferCodecName == "unicode":
        # (Still, 'icu' or 'iconv' may provide converted content, but ...)
        # If the internal buffer is 'unicode', then the pattern's state
        # machines are not converted. The requirement for the pattern's
        # range is the same as for the 'buffer element chunks'.
        return EncodingTrafoUnicode(NumberSet(Interval(0, 0x110000)),
                                    NumberSet(Interval(0, 0x110000)))

    elif BufferCodecName == "unit-test":
        return EncodingTrafoUnicode(NumberSet_All(), NumberSet_All())

    else:
        return EncodingTrafoByTable(BufferCodecName)
Ejemplo n.º 6
0
def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence):
    script_list = [
        "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille",
        "Buginese", "Buhid", "Canadian_Aboriginal", "Cherokee", "Common",
        "Cuneiform", "Cypriot", "Deseret", "Gothic", "Greek", "Hanunoo",
        "Hebrew", "Hiragana", "Inherited", "Kannada", "Han", "Katakana",
        "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B",
        "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya",
        "Ogham", "Old_Italic", "Old_Persian", "Phoenician", "Shavian",
        "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil",
        "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi"
    ]
    sets = [X(name) for name in script_list]

    orig = combination.do(map(lambda x: x.sm, sets))
    state_n_before, result = transform(Trafo, orig)

    # print result.get_graphviz_string(Option="hex")

    for set in sets:
        set.check(result, unicode_to_transformed_sequence)
    print "Translated %i groups without abortion on error (OK)" % len(sets)

    union = NumberSet()
    for nset in map(lambda set: set.charset, sets):
        union.unite_with(nset)

    inverse_union = NumberSet(Interval(0, 0x110000))
    inverse_union.subtract(union)
    # print inverse_union.get_string(Option="hex")
    check_negative(result,
                   inverse_union.get_intervals(PromiseToTreatWellF=True),
                   unicode_to_transformed_sequence)
Ejemplo n.º 7
0
    def load_UnicodeData(self):
        """Load 'UnicodeData.txt' and fill the property databases.

        Populates code point databases for: name ('na'), general
        category ('gc'), bidi class ('bc'), numeric value ('nv'),
        Unicode 1 name ('na1'), and ISO comment ('isc').
        """
        fh = open_data_base_file("UnicodeData.txt")

        # some rows contain aliases, so they need to get converted into values
        property_general_category = self.db["gc"]
        property_bidi_class = self.db["bc"]

        def convert(Property, ValueAlias):
            """Convert specified ValueAlias to Value of the given property."""
            # (FIX: deprecated 'dict.has_key' + double lookup replaced
            #  by a single 'dict.get'.)
            return Property.alias_to_name_map.get(ValueAlias, ValueAlias)

        names_db = {}
        general_category_db = {}
        bidi_class_db = {}
        numeric_value_db = {}
        names_uc1_db = {}
        iso_comment_db = {}

        for line in fh.readlines():
            # Strip comments; skip empty lines.
            # (FIX: 'line.find("#")' was called twice per line.)
            comment_i = line.find("#")
            if comment_i != -1: line = line[:comment_i]
            if line == "" or line.isspace(): continue

            x = line.split(";")

            # Field indices follow the UnicodeData.txt column layout.
            code_point       = int("0x" + x[0].strip(), 16)             # column 0
            name             = x[1].strip().replace(" ", "_")           # column 1
            general_category = convert(property_general_category,
                                       x[2].strip().replace(" ", "_"))  # column 2
            bidi_class       = convert(property_bidi_class,
                                       x[4].strip().replace(" ", "_"))  # column 4
            numeric_value    = x[6].strip()                             # column 6
            uc1_name         = x[10].strip().replace(" ", "_")          # column 10
            iso_comment      = x[11].strip().replace(" ", "_")          # column 11

            names_db[name] = code_point
            general_category_db.setdefault(
                general_category, NumberSet()).quick_append_value(code_point)
            bidi_class_db.setdefault(
                bidi_class, NumberSet()).quick_append_value(code_point)
            numeric_value_db.setdefault(
                numeric_value, NumberSet()).quick_append_value(code_point)
            names_uc1_db[uc1_name] = code_point
            iso_comment_db[iso_comment] = str(code_point)

        # (FIX: the file handle was never closed.)
        fh.close()

        self.db["na"].code_point_db = names_db  # Name
        self.db["gc"].code_point_db = general_category_db  # General Category
        self.db["bc"].code_point_db = bidi_class_db  # BidiClass
        self.db["nv"].code_point_db = numeric_value_db  # Numeric Value
        self.db["na1"].code_point_db = names_uc1_db  # Name Unicode 1
        self.db["isc"].code_point_db = iso_comment_db  # ISO_Comment
Ejemplo n.º 8
0
 def __init__(self):
     """Configure the 'utf16' transformation.

     Code unit 0 must lie outside the low-surrogate gap; code unit 1
     must be a low surrogate (0xDC00..0xDFFF).
     """
     EncodingTrafoBySplit.__init__(self, "utf16", 
                                      CodeUnitRange=NumberSet.from_range(0, 0x10000))
     admissible_unit0 = NumberSet([
         Interval(0x0000, 0xDC00), Interval(0xE000, 0x10000)
     ])
     admissible_unit1 = NumberSet([Interval(0xDC00, 0xE000)])
     self.error_range_code_unit0 = admissible_unit0.get_complement(NumberSet_All())
     self.error_range_code_unit1 = admissible_unit1.get_complement(NumberSet_All())
Ejemplo n.º 9
0
def prepare(A_list, B_list):
    """Build two NumberSets from lists of (begin, end) border pairs.

    RETURNS: (A, B) -- the consistency-checked NumberSets.
    """
    def build(pair_list):
        # Accumulate every (begin, end) pair as one interval.
        number_set = NumberSet()
        for begin, end in pair_list:
            number_set.add_interval(Interval(begin, end))
        number_set.assert_consistency()
        return number_set

    return build(A_list), build(B_list)
Ejemplo n.º 10
0
    def __wildcard_value_match(self, WildCardValue):
        """Unite the code point sets of all values matching 'WildCardValue'.

        RETURNS: NumberSet union of the matches; None if nothing matches.
        """
        value_list = self.get_wildcard_value_matches(WildCardValue)
        if not value_list:
            return None

        result = NumberSet()
        for value in value_list:
            result.unite_with(NumberSet(self.code_point_db[value]))

        # No decoupling, since result is computed each fresh and new
        return result
Ejemplo n.º 11
0
    def __init__(self):
        """Configure the 'utf8' transformation over byte range [0, 0x100)."""
        drain_set = NumberSet.from_range(0, 0x100)
        EncodingTrafoBySplit.__init__(self, "utf8", CodeUnitRange=drain_set)
        self.UnchangedRange = 0x7F

        # Byte 0: any admissible UTF8 lead-byte pattern.
        admissible_byte0 = NumberSet([
            Interval(0x00, 0x7F + 1), Interval(0xC0, 0xDF + 1),  # 0xxxxxxx, 110xxxxx
            Interval(0xE0, 0xEF + 1), Interval(0xF0, 0xF7 + 1),  # 1110xxxx, 11110xxx
            Interval(0xF8, 0xFB + 1), Interval(0xFC, 0xFD + 1),  # 111110xx, 1111110x
        ])
        self.error_range_byte0 = admissible_byte0.get_complement(NumberSet_All())

        # Bytes 1..N: only continuation bytes 10xxxxxx.
        admissible_byteN = NumberSet(Interval(0x80, 0xBF + 1))
        self.error_range_byteN = admissible_byteN.get_complement(NumberSet_All())
Ejemplo n.º 12
0
    def __init__(self):
        """Configure the 'utf16' transformation.

        A character in UTF16 is represented by at most two code units,
        hence there are exactly two error ranges (adapted later).
        """
        # Code unit 0: anything except the low-surrogate gap.
        admissible_0 = NumberSet([
            Interval(0x0000, 0xDC00),
            Interval(0xE000, 0x10000)
        ])
        error_range_0 = admissible_0.get_complement(NumberSet_All())  # Adapted later

        # Code unit 1: only low surrogates 0xDC00..0xDFFF.
        error_range_1 = NumberSet([Interval(0xDC00, 0xE000)]).get_complement(
            NumberSet_All())  # Adapted later

        EncodingTrafoBySplit.__init__(self, "utf16",
                                      {0: error_range_0, 1: error_range_1})
Ejemplo n.º 13
0
 def test(UC):
     global trafo_cp037
     x = NumberSet(UC)
     y = x.clone()
     x.transform_by_table(trafo_cp037)
     x.assert_consistency()
     print "0x%02X --> 0x%s" % (UC, x.get_string(Option="hex"))
Ejemplo n.º 14
0
def general_checks(loop_map, appendix_sm_list):

    print "#_[ Checks ]__________________________________________________"
    print
    print "character sets do not intersect",
    all_set = NumberSet()
    for lei in loop_map:
        assert lei.character_set is not None
        assert not lei.character_set.has_intersection(all_set)
        all_set.unite_with(lei.character_set)
    print "[ok]"

    print "count actions do not appear more than once",
    count_action_couple_set = set()
    count_action_plain_set = set()
    appendix_sm_id_set = set()
    print "[ok]"

    ## if "Split" in sys.argv or "Plain" in sys.argv:
    ##     list_id_set = set(sm.get_id() for sm in appendix_sm_list)
    ##     assert appendix_sm_id_set == list_id_set
    ##     print "appendix sm-ids are the same in loop map and sm list: [ok]"
    print "exit character set exits: [%s]" % any(lei.aux_count_action is None
                                                 for lei in loop_map)

    print
Ejemplo n.º 15
0
def general_checks(loop_map, appendix_sm_list):
    print "#_[ Checks ]__________________________________________________"
    print
    print "character sets do not intersect",
    all_set = NumberSet()
    for lei in loop_map:
        assert lei.character_set is not None
        assert not lei.character_set.has_intersection(all_set)
        all_set.unite_with(lei.character_set)
    print "[ok]"

    print "count actions do not appear more than once",
    count_action_couple_set = set()
    count_action_plain_set = set()
    exit_exists_f = False
    appendix_sm_id_set = set()
    for lei in loop_map:
        if lei.count_action is None:
            assert lei.appendix_sm_id is None
            exit_exists_f = True
        elif lei.appendix_sm_id is None:
            assert lei.incidence_id not in count_action_plain_set
            count_action_plain_set.add(lei.incidence_id)
        else:
            assert lei.incidence_id not in count_action_couple_set
            count_action_couple_set.add(lei.incidence_id)
            appendix_sm_id_set.add(lei.appendix_sm_id)
    print "[ok]"
    list_id_set = set(sm.get_id() for sm in appendix_sm_list)
    assert appendix_sm_id_set == list_id_set
    print "appendix sm-ids are the same in loop map and sm list: [ok]"
    print "exit character set exits: [%s]" % exit_exists_f

    print
Ejemplo n.º 16
0
    def get(self):
        """Translate the current cursor into a NumberSet, then advance
        the cursor to the next combination.

        RETURNS: NumberSet built from the cursor; None if the cursor
                 is empty.
        """
        # Transform 'cursor' into a number set
        result = NumberSet()
        K = len(self.__cursor)
        if K == 0: return None
        k = 0
        end = 0
        # Pairs (cursor[k], cursor[k+1]) encode (gap, length) relative to
        # the previous interval's end.
        while k < K - 1:
            begin = end + self.__cursor[k]
            end = begin + self.__cursor[k + 1]
            if end > self.N:
                # Interval would exceed the limit 'self.N' => shrink the
                # cursor by one position and stop collecting.
                self.__cursor.pop()
                K -= 1
                break
            if begin != end:
                result.quick_append_interval(Interval(begin, end))
            k += 2

        # Increment cursor
        # Odometer-style increment: position 0 steps by 2 (wraps at 8),
        # higher positions step by 1 (wrap at 3). On wrap, a position is
        # reset to 1 and the carry moves to the next position.
        k = 0
        while k < K:
            if k == 0:
                self.__cursor[k] += 2
                if self.__cursor[k] < 8:
                    break
            else:
                self.__cursor[k] += 1
                if self.__cursor[k] < 3:
                    break
            self.__cursor[k] = 1
            k += 1

        return result
Ejemplo n.º 17
0
    def __whitespace_default(self):
        """Try to define default whitespace ' ' or '\\t' if their positions
        are not yet occupied in the count_command_map.

        RETURNS: NumberSet of the characters that could be claimed.
        """
        result = NumberSet()
        for candidate in (NumberSet(ord(" ")), NumberSet(ord("\t"))):
            # Only characters not claimed by another counter may default.
            if not self.specifier_count_op_map.find_occupier(candidate, set()):
                result.unite_with(candidate)

        if result.is_empty():
            error.log("Trying to implement default whitespace ' ' or '\\t' failed.\n"
                      "Characters are occupied by other elements.", self.sr)
        return result
Ejemplo n.º 18
0
 def get_trigger_set_to_target(self, TargetIdx):
     """Returns all triggers that lead to target 'TargetIdx'. If a trigger 'None' is returned
        it means that the epsilon transition triggers to target state. If the TargetIndex is 
        omitted the set of all triggers, except the epsilon triggers, are returned.
     """
     # (FIX: deprecated 'dict.has_key' + subscript double lookup replaced
     #  by a single 'dict.get'; an empty NumberSet is returned for
     #  unknown targets, exactly as before.)
     return self.__db.get(TargetIdx, NumberSet())
Ejemplo n.º 19
0
def LineColumnCount_Default():
    """Lazily build and cache the default line/column counter.

    Defaults: '\\n' counts one newline, '\\t' is a grid of width 4, any
    other character counts one space. The instance is cached in the
    module-level '_LineColumnCount_Default'.
    """
    global _LineColumnCount_Default

    if _LineColumnCount_Default is not None:
        return _LineColumnCount_Default

    specifier = SpecifierCountActionMap()
    specifier.add(NumberSet(ord('\n')), "newline", 1, SourceRef_DEFAULT)
    specifier.add(NumberSet(ord('\t')), "grid",    4, SourceRef_DEFAULT)
    specifier.define_else("space",   1, SourceRef_DEFAULT)    # Define: "\else"
    count_command_map = specifier.finalize(
        Setup.buffer_codec.source_set.minimum(), 
        Setup.buffer_codec.source_set.supremum(),             # Apply:  "\else"
        SourceRef_DEFAULT) 

    _LineColumnCount_Default = LineColumnCount(SourceRef_DEFAULT, 
                                               count_command_map)
    return _LineColumnCount_Default
Ejemplo n.º 20
0
def do(section_list, fh):
    """Parses a codec information file. The described codec can only be
    a 'static character length' encoding. That is every character in the
    code occupies the same number of bytes.

    Each record consists of three integers: source interval begin,
    source interval size, and target interval begin. Parsed sections
    are appended to 'section_list' as [begin, end, target_begin].

    RETURNS: [0] Set of characters in unicode which are covered by the
                 described codec.
             [1] Range of values in the codec elements.
             [2] Error string; None if parsing succeeded.
    """
    source_set = NumberSet()
    drain_set = NumberSet()

    error_str = None

    try:
        while error_str is None:
            skip_whitespace(fh)
            source_begin = read_integer(fh)
            if source_begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                continue

            skip_whitespace(fh)
            source_size = read_integer(fh)
            if source_size is None:
                error_str = "Missing integer (source interval size) in codec file."
                continue

            skip_whitespace(fh)
            target_begin = read_integer(fh)
            if target_begin is None:
                error_str = "Missing integer (target interval begin) in codec file."
                continue

            source_end = source_begin + source_size
            # (FIX: unidiomatic 'list.append(section_list, ...)' replaced
            #  by a plain method call.)
            section_list.append([source_begin, source_end, target_begin])

            source_set.add_interval(Interval(source_begin, source_end))
            drain_set.add_interval(
                Interval(target_begin, target_begin + source_size))

    except EndOfStreamException:
        # Normal termination: the file has been consumed completely.
        pass

    return source_set, drain_set, error_str
Ejemplo n.º 21
0
 def __get_remaining_set(self):
     """Unite all counted character sets except structural ones (bad,
     newline begin/end/suppressor) and return the complement with
     respect to the source character set."""
     ignored = (E_CharacterCountType.BAD, 
                E_CharacterCountType.BEGIN_NEWLINE_SUPPRESSOR, 
                E_CharacterCountType.BEGIN_NEWLINE, 
                E_CharacterCountType.END_NEWLINE) 
     covered = NumberSet()
     for character_set, info in self.__map:
         if info.cc_type not in ignored:
             covered.unite_with(character_set)
     return covered.get_complement(Setup.buffer_codec.source_set)
Ejemplo n.º 22
0
 def __init__(self, SourceSet, Name="unicode"):
     """Plain 'Unicode': a character is a single code unit, i.e. its
     code point. Thus only code unit '0' is specified and everything is
     allowed. ('Everything allowed' is disputable, since certain ranges
     are disallowed.)
     """
     assert Name in ("unicode", "utf32")
     # Code unit 0 carries an empty error range.
     EncodingTrafo.__init__(self, Name, SourceSet,
                            {0: NumberSet()})
Ejemplo n.º 23
0
Archivo: base.py Proyecto: xxyzzzq/quex
 def do_Number(self, number):
     """RETURNS: An interval sequence that implements the given single
     number; None if it cannot be transformed.
     """
     # A one-element number set yields exactly one interval sequence,
     # and the interval size must be one.
     sequences = self.do_NumberSet(NumberSet(number))
     if sequences is None:
         return None
     return sequences[0]
Ejemplo n.º 24
0
    def _assert_consistency(self):
        """Verify structural invariants of the loop map entries."""
        for lme in self:
            assert lme is not None
            assert lme.character_set is not None
            # Every entry carries a couple-terminal id or explicit code.
            assert lme.iid_couple_terminal is not None or lme.code is not None

        # Assert: Transition triggers do not intersect!
        covered = NumberSet()
        for lme in self:
            assert not lme.character_set.has_intersection(covered)
            covered.unite_with(lme.character_set)
Ejemplo n.º 25
0
def verify(A, TrafoInfo):
    """Reference transformation: map every single code point of 'A'
    through 'TrafoInfo' ([source_begin, source_end, target_begin]
    triples). RETURNS: resulting NumberSet.
    """
    result = NumberSet()
    for interval in A.get_intervals():
        for x in range(interval.begin, interval.end):
            # Map 'x' through every section that covers it.
            for source_begin, source_end, target_begin in TrafoInfo:
                if source_begin <= x < source_end:
                    result.add_interval(Interval(target_begin + (x - source_begin)))
    result.assert_consistency()
    return result
Ejemplo n.º 26
0
 def test(X):
     print "#_______________________________________________"
     nset  = NumberSet([ Interval(x, y) for x, y in X])
     clone = nset.clone()
     print "#NumberSet:         %s" % nset
     result = nset.clone()
     result.complement(all)
     print "#NumberSet.inverse: %s" % result
     assert result.is_equal(nset.get_complement(all))
     assert result.intersection(nset).is_empty()
     assert result.union(nset).is_all()
Ejemplo n.º 27
0
    def load_Composition_Exclusion(self):
        """Load 'CompositionExclusions.txt' into the 'CE' property.

        Only column 0 (the code point) is of interest.
        """
        table = parse_table("CompositionExclusions.txt", NumberColumnList=[0])

        number_set = NumberSet()
        for row in table:
            code_point = row[0]
            number_set.quick_append_interval(Interval(code_point, code_point + 1))
        number_set.clean()

        self.db["CE"].code_point_db = number_set
Ejemplo n.º 28
0
 def get_number_set(Cursor):
     """Interpret 'Cursor' as a list of interval borders and build a
     NumberSet.

     Element 0 and the trailing '-1' are helper values, not borders; a
     cursor of length 2 therefore encodes 'no set' (S_None).
     """
     if len(Cursor) == 2:
         return S_None
     borders = list(Cursor[1:])  # drop helper element 0
     intervals = []
     while len(borders) != 1:
         # Argument evaluation is left-to-right: begin first, then end.
         intervals.append(Interval(borders.pop(0), borders.pop(0)))
     return NumberSet(intervals)
Ejemplo n.º 29
0
def CounterSetupLineColumn_Default():
    """Lazily build and cache the default line/column counter setup.

    Defaults: '\\n' counts one newline, '\\t' is a grid of width 4,
    any other character counts one space. The instance is cached in
    '_CounterSetupLineColumn_Default'.
    """
    global _CounterSetupLineColumn_Default

    if _CounterSetupLineColumn_Default is not None:
        return _CounterSetupLineColumn_Default

    cmap = CountOpMap()
    cmap.add(NumberSet(ord('\n')), "newline", 1, SourceRef_DEFAULT)
    cmap.add(NumberSet(ord('\t')), "grid", 4, SourceRef_DEFAULT)
    cmap.define_else("space", 1, SourceRef_DEFAULT)  # Define: "\else"
    cmap.assign_else_count_command(
        Setup.buffer_codec.source_set.minimum(),
        Setup.buffer_codec.source_set.supremum(),  # Apply:  "\else"
        SourceRef_DEFAULT)

    _CounterSetupLineColumn_Default = ParserDataLineColumn(
        SourceRef_DEFAULT, cmap)
    return _CounterSetupLineColumn_Default
Ejemplo n.º 30
0
def get_all():
    """RETURNS:

       A state machine that 'eats' absolutely everything, i.e. 


                              .--- \Any ---.
                              |            |
           (0)--- \Any --->(( 0 ))<--------'
    """
    result = StateMachine()

    target_i = index.get()

    # Accepting state that loops on any character.
    accept_state = State(AcceptanceF=True)
    accept_state.add_transition(NumberSet(Interval(-sys.maxint, sys.maxint)), target_i)
    result.states[target_i] = accept_state

    # The initial state triggers on any character into the accepting state.
    result.get_init_state().add_transition(
        NumberSet(Interval(-sys.maxint, sys.maxint)), target_i)

    return result