# Code example #1 (score: 0)
# File: utf16_state_split.py -- Project: xxyzzzq/quex
class EncodingTrafoUTF16(EncodingTrafoBySplit):
    """Transformation of Unicode code point state machines into UTF16 code
    unit state machines. Code points below 0x10000 are represented by a
    single code unit; code points beyond require a surrogate pair, i.e. two
    code units.
    """
    UnchangedRange = 0x10000

    def __init__(self):
        EncodingTrafoBySplit.__init__(self, "utf16", 
                                         CodeUnitRange=NumberSet.from_range(0, 0x10000))
        # The error ranges are the complements of the admissible values.
        # Code unit [0] is good if it is a BMP character outside the low
        # surrogate range, i.e. in [0, 0xDC00) or [0xE000, 0x10000).
        self.error_range_code_unit0 = NumberSet([
            Interval(0x0000, 0xDC00), Interval(0xE000, 0x10000)
        ]).get_complement(NumberSet_All())
        # Code unit [1] is good only if it is a low surrogate in
        # [0xDC00, 0xE000).
        self.error_range_code_unit1 = NumberSet([
            Interval(0xDC00, 0xE000)
        ]).get_complement(NumberSet_All())

    def prune(self, number_set):
        """Restrict 'number_set' (in-place) to code points encodable in
        UTF16: remove the surrogate gap ('ForbiddenRange') and cut at the
        Unicode limit 0x110000.
        """
        # No 'global' statement needed: ForbiddenRange is only read here,
        # never assigned.
        number_set.subtract(ForbiddenRange)
        number_set.mask(0, 0x110000)

    def get_interval_sequences(self, Orig):
        """Transform the Unicode interval 'Orig' into a list of interval
        sequences where each sequence describes the UTF16 code units that
        encode a contiguous part of 'Orig'.
        """
        interval_1word, intervals_2word = _get_contigous_intervals(Orig)

        result = []
        if interval_1word is not None:
            # Single code unit characters: the interval itself is the sequence.
            result.append([interval_1word])

        if intervals_2word is not None:
            # Surrogate pair characters: two code units per character.
            result.extend(
                _get_trigger_sequence_for_interval(interval)
                for interval in intervals_2word
            )
        return result

    def lexatom_n_per_character(self, CharacterSet):
        """If all characters in a unicode character set state machine require the
        same number of bytes to be represented this number is returned.  Otherwise,
        'None' is returned.

        RETURNS:   N > 0  number of bytes required to represent any character in the 
                          given state machine.
                   None   characters in the state machine require different numbers of
                          bytes.
        """
        assert isinstance(CharacterSet, NumberSet)

        interval_list = CharacterSet.get_intervals(PromiseToTreatWellF=True)
        front = interval_list[0].begin     # First element of number set
        back  = interval_list[-1].end - 1  # Last element of number set
        # Determine number of bytes required to represent the first and the 
        # last character of the number set. The number of bytes per character
        # increases monotonously, so only borders have to be considered.
        front_chunk_n = len(unicode_to_utf16(front))
        back_chunk_n  = len(unicode_to_utf16(back))
        if front_chunk_n != back_chunk_n: return None
        else:                             return front_chunk_n

    def _plug_encoding_error_detectors(self, sm):
        """Adorn states with transitions to the 'on_encoding_error' handler if the 
        input value lies beyond the limits. The state machine is an implementation
        of linear sequences of intervals. Thus, the 'code unit position' can be 
        determined by the number of transitions from the init state.

        sm = mini state machine that implements the transition sequences.

        Bad ranges for code units (2 byte each):
            1st: 0xDC00 - 0xDFFF, and anything >= 0x10000
            2nd: 0x0000 - 0xDBFF, and anything >= 0xE000
        """
        # 'CodeUnit[0]' appears at the init state
        # (Adapt trigger map before entering the 'on bad lexatom state'
        init_tm = sm.get_init_state().target_map.get_map()
        workset = set(init_tm.iterkeys()) 
        # Sanity check: no existing transition may already trigger on values
        # of the error range.
        for trigger_set in init_tm.itervalues():
            assert not trigger_set.has_intersection(self.error_range_code_unit0)

        bad_lexatom_state_index = self._plug_encoding_error_detector_single_state(sm, init_tm)

        # 'CodeUnit[>0]' appear all at later states
        done = set([bad_lexatom_state_index])
        while workset:
            si = workset.pop()
            tm = sm.states[si].target_map.get_map()
            done.add(si)

            # Only add bad lexatom detection to state that transit on lexatoms
            # (Bad lexatom states, btw. do not have transitions)
            if not tm: continue

            for trigger_set in tm.itervalues():
                assert not trigger_set.has_intersection(self.error_range_code_unit1)

            workset.update(new_si for new_si in tm.iterkeys() if new_si not in done) 
            tm[bad_lexatom_state_index] = self.error_range_code_unit1

    def _plug_encoding_error_detector_single_state(self, sm, target_map):
        """Let the state owning 'target_map' transit to the 'bad lexatom'
        state upon receipt of a value in 'error_range_code_unit0'.

        RETURNS: index of the bad lexatom state.
        """
        bad_lexatom_state_index = sm.access_bad_lexatom_state()
        if target_map: 
            target_map[bad_lexatom_state_index] = self.error_range_code_unit0
        return bad_lexatom_state_index

    def adapt_source_and_drain_range(self, LexatomByteN):
        """Adapt source and drain ranges to lexatoms of 'LexatomByteN' bytes.
        'LexatomByteN == -1' means 'no restriction'.
        """
        EncodingTrafoBySplit.adapt_source_and_drain_range(self, LexatomByteN)
        self.error_range_code_unit0.mask_interval(self.lexatom_range)
        self.error_range_code_unit1.mask_interval(self.lexatom_range)
        if LexatomByteN == -1:
            return
        elif LexatomByteN >= 2: 
            return
        else:
            # if there are less than 2 byte for the lexatoms, then only the 
            # unicode range from 0x00 to 0xFF can be treated.
            self.source_set.mask(0x00, 0x100)
# Code example #2 (score: 0)
class EncodingTrafoUTF8(EncodingTrafoBySplit):
    """Transformation of Unicode code point state machines into UTF8 byte
    (code unit) state machines. A code point is represented by a sequence
    of one to six bytes.
    """
    def __init__(self):
        drain_set = NumberSet.from_range(0, 0x100)
        EncodingTrafoBySplit.__init__(self, "utf8", CodeUnitRange=drain_set)
        self.UnchangedRange = 0x7F

        # Admissible values for byte[0]: ASCII (0xxxxxxx) or a valid UTF8
        # lead byte (110xxxxx ... 1111110x). The error range is the
        # complement thereof.
        self.error_range_byte0 = NumberSet([
            Interval(0b00000000, 0b01111111+1), Interval(0b11000000, 0b11011111+1),
            Interval(0b11100000, 0b11101111+1), Interval(0b11110000, 0b11110111+1),
            Interval(0b11111000, 0b11111011+1), Interval(0b11111100, 0b11111101+1),
        ]).get_complement(NumberSet_All())

        # Admissible values for byte[>0]: continuation bytes 10xxxxxx only.
        self.error_range_byteN = NumberSet(
            Interval(0b10000000, 0b10111111+1)
        ).get_complement(NumberSet_All())

    def adapt_source_and_drain_range(self, LexatomByteN):
        """Adapt source and drain ranges to lexatoms of 'LexatomByteN' bytes
        and restrict the error ranges accordingly.
        """
        EncodingTrafoBySplit.adapt_source_and_drain_range(self, LexatomByteN)
        self.error_range_byte0.mask_interval(self.lexatom_range)
        self.error_range_byteN.mask_interval(self.lexatom_range)

    def prune(self, X):
        # UTF8 can represent the complete Unicode range; nothing to prune.
        pass

    def get_interval_sequences(self, Orig):
        """Orig = Unicode Trigger Set. It is transformed into a sequence of intervals
        that cover all elements of Orig in a representation as UTF8 code units.
        A transition from state '1' to state '2' on 'Orig' is then equivalent to 
        the transitions along the code unit sequence.
        """
        db = _split_by_transformed_sequence_length(Orig)
        if db is None: return []

        result = []
        for seq_length, interval in db.items():
            # Distinct name 'sub_interval' avoids shadowing the outer loop's
            # 'interval', which is still needed in later iterations.
            contiguous_list = _get_contiguous_interval_sequences(interval, seq_length)
            result.extend(
                _get_trigger_sequence_for_contigous_byte_range_interval(sub_interval, seq_length)
                for sub_interval in contiguous_list)
        return result

    def lexatom_n_per_character(self, CharacterSet):
        """If all characters in a unicode character set state machine require the
        same number of bytes to be represented this number is returned.  Otherwise,
        'None' is returned.

        RETURNS:   N > 0  number of bytes required to represent any character in the 
                          given state machine.
                   None   characters in the state machine require different numbers of
                          bytes.
        """
        assert isinstance(CharacterSet, NumberSet)

        interval_list = CharacterSet.get_intervals(PromiseToTreatWellF=True)
        front = interval_list[0].begin     # First element of number set
        back  = interval_list[-1].end - 1  # Last element of number set
        # Determine number of bytes required to represent the first and the 
        # last character of the number set. The number of bytes per character
        # increases monotonously, so only borders have to be considered.
        front_chunk_n = len(unicode_to_utf8(front))
        back_chunk_n  = len(unicode_to_utf8(back))
        if front_chunk_n != back_chunk_n: return None
        else:                             return front_chunk_n

    def _plug_encoding_error_detectors(self, sm):
        """Adorn states with transitions to the 'on_encoding_error' handler if the 
        input value lies beyond the limits. The state machine is an implementation
        of linear sequences of intervals. Thus, the 'byte position' can be 
        determined by the number of transitions from the init state.

        sm = mini state machine that implements the transition sequences.

        UTF8 Encodings in binary look like the following (see 'man utf8').

            1 byte: 0xxxxxxx
            2 byte: 110xxxxx 10xxxxxx
            3 byte: 1110xxxx 10xxxxxx 10xxxxxx
            4 byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            5 byte: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            6 byte: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxxx

        NOTE(review): the 5 and 6 byte forms predate RFC 3629, which limits
        UTF8 to 4 bytes; they are deliberately still accepted here.

        The resulting byte ranges can be observed in 'error_range_byte0' for Byte[0]
        and 'error_range_byteN' for Byte[>0].
        """
        # 'Byte[0]' appears at the init state
        # (Adapt trigger map before entering the 'on bad lexatom state'
        init_tm = sm.get_init_state().target_map.get_map()
        workset = set(init_tm.iterkeys()) 
        # Sanity check: no existing transition may already trigger on values
        # of the error range.
        for trigger_set in init_tm.itervalues():
            assert not trigger_set.has_intersection(self.error_range_byte0)

        bad_lexatom_state_index = self._plug_encoding_error_detector_single_state(sm, init_tm)

        # 'Byte[>0]' appear all at later states
        done = set([bad_lexatom_state_index])
        while workset:
            si = workset.pop()
            tm = sm.states[si].target_map.get_map()
            done.add(si)

            # Only add bad lexatom detection to state that transit on lexatoms
            # (Bad lexatom states, btw. do not have transitions)
            if not tm: continue

            for trigger_set in tm.itervalues():
                assert not trigger_set.has_intersection(self.error_range_byteN)
            workset.update(new_si for new_si in tm.iterkeys() if new_si not in done) 
            tm[bad_lexatom_state_index] = self.error_range_byteN

    def _plug_encoding_error_detector_single_state(self, sm, target_map):
        """Let the state owning 'target_map' transit to the 'bad lexatom'
        state upon receipt of a value in 'error_range_byte0'.

        RETURNS: index of the bad lexatom state.
        """
        bad_lexatom_state_index = sm.access_bad_lexatom_state()
        if target_map: 
            target_map[bad_lexatom_state_index] = self.error_range_byte0
        return bad_lexatom_state_index