コード例 #1
0
    def _validate_utf_8_code_point_consistency(self,
                                               code_point_start_byte_index,
                                               code_point_bytes_count):

        # Make sure that the UTF-8 str has sufficient length to accommodate all the bytes of the
        # code point.
        if code_point_start_byte_index + code_point_bytes_count > len(
                self._raw_utf_8_sequence):
            raise SmartStrException(
                'UTF-8 str format error. Code point at index {} is of size {}, but the total length of the str is {}.'
                .format(code_point_start_byte_index, code_point_bytes_count,
                        len(self._raw_utf_8_sequence)))

        # Validate the format of the trailing bytes of the code point.
        for i in range(code_point_start_byte_index + 1,
                       code_point_start_byte_index + code_point_bytes_count):

            current_trailing_byte = self._raw_utf_8_sequence[i]
            current_trailing_byte_ordinal = ord(current_trailing_byte)

            # Make sure that the trailing bytes of the code point have the correct format
            # (10000000 - 10111111).
            if current_trailing_byte_ordinal < 0x80 or current_trailing_byte_ordinal > 0xBF:
                raise SmartStrException(
                    'UTF-8 str format error. Code point trailing byte has a faulty format (byte at index {} of size {}, offset {}).'
                    .format(code_point_start_byte_index,
                            code_point_bytes_count,
                            i - code_point_start_byte_index))
コード例 #2
0
    def _validate_utf_16_code_point_consistency(self,
                                                code_point_start_word_index,
                                                code_point_words_count):

        # Make sure that the UTF-16 str has sufficient length to accommodate all the words of the
        # code point.
        if code_point_start_word_index + code_point_words_count > len(
                self._raw_utf_16_sequence):
            raise SmartStrException(
                'UTF-16 str format error. Code point at index {} is of size {}, but the total length of the str is {}.'
                .format(code_point_start_word_index, code_point_words_count,
                        len(self._raw_utf_16_sequence)))

        # If the code point consists of 2 words, verify that those words are a surrogate pair
        # (high surrogate and low surrogate).
        if code_point_words_count == 2:
            # Verify that the first word is a high surrogate.
            first_code_point_word = self._raw_utf_16_sequence[
                code_point_start_word_index]
            if not self._is_utf_16_high_surrogate(first_code_point_word):
                raise SmartStrException(
                    'UTF-16 str format error. Code point consists of 2 words, but the first word is not a high surrogate (word at index {}).'
                    .format(code_point_start_word_index))
            # Verify that the second word is a low surrogate.
            second_code_point_word = self._raw_utf_16_sequence[
                code_point_start_word_index + 1]
            if not self._is_utf_16_low_surrogate(second_code_point_word):
                raise SmartStrException(
                    'UTF-16 str format error. Code point consists of 2 words, but the second word is not a low surrogate (word at index {}).'
                    .format(code_point_start_word_index + 1))
コード例 #3
0
 def _validate_list_input(code_points):
     if len(code_points) > 2:
         raise SmartStrException(
             'The list to initialize the Smart Char object is too long - {} objects.'
             .format(len(code_points)))
     for code_point in code_points:
         if not isinstance(code_point, CodePoint):
             raise SmartStrException(
                 'Illegal type {} of an object in the list used for the Smart Char initialization.'
                 .format(type(code_point)))
コード例 #4
0
ファイル: code_point.py プロジェクト: markk2504/smart-string
    def __init__(self, raw_input):
        """Initialize the code point from one of the supported raw inputs.

        Args:
            raw_input: An 'int' (Unicode numeric value), a 'str' (UTF-8
                sequence of bytes) or a 'unicode' (UTF-16 sequence of words).

        Raises:
            SmartStrException: If raw_input is of any other type.
        """
        # Pick the per-type settings first; the attributes are assigned below.
        if isinstance(raw_input, int):
            # Unicode numeric value.
            init_type = StrInitInputType.UNICODE_VAL
            to_unicode_val = self.get_validated_unicode_val
            utf_8_sequence, utf_16_sequence = None, None
        elif isinstance(raw_input, str):
            # UTF-8 sequence of bytes ('str').
            init_type = StrInitInputType.UTF_8
            to_unicode_val = self.get_utf_8_code_point_val
            utf_8_sequence, utf_16_sequence = raw_input, None
        elif isinstance(raw_input, unicode):
            # UTF-16 sequence of words ('unicode').
            init_type = StrInitInputType.UTF_16
            to_unicode_val = self.get_utf_16_code_point_val
            utf_8_sequence, utf_16_sequence = None, raw_input
        else:
            raise SmartStrException(
                'Illegal Code Point raw input type {}.'.format(
                    type(raw_input)))

        self._init_raw_type = init_type
        self._raw_utf_8_sequence = utf_8_sequence
        self._raw_utf_16_sequence = utf_16_sequence
        self._unicode_val = to_unicode_val(raw_input)

        self._code_point_type = self.get_code_point_type(self._unicode_val)
コード例 #5
0
ファイル: code_point.py プロジェクト: markk2504/smart-string
 def get_validated_unicode_val(unicode_val):
     """Return unicode_val unchanged if it is within the Unicode range.

     Args:
         unicode_val: Candidate numeric value of a code point.

     Returns:
         The same value, when it lies in 0x0000 - 0x10FFFF (inclusive).

     Raises:
         SmartStrException: If the value is outside that range.
     """
     if not (0x0000 <= unicode_val <= 0x10FFFF):
         raise SmartStrException(
             'Int value {} is not a valid value for a Unicode code point'.
             format(unicode_val))
     return unicode_val
コード例 #6
0
    def _process_rew_utf_8_str(self):
        """Split the raw UTF-8 sequence into code points and process them.

        The raw sequence is walked byte by byte. The leading byte of each
        code point determines how many bytes the code point occupies; the
        code point is then validated, wrapped in a CodePoint and handed to
        _convert_code_points_to_char together with the previous result.

        Raises:
            SmartStrException: If a leading byte has an illegal format. This
                includes a continuation byte (0x80 - 0xBF) found where a
                leading byte is expected. Validation errors raised by
                _validate_utf_8_code_point_consistency propagate as well.
        """
        current_byte_idx = 0
        previous_code_point = None

        while current_byte_idx < len(self._raw_utf_8_sequence):

            leading_byte_ordinal = ord(
                self._raw_utf_8_sequence[current_byte_idx])

            if leading_byte_ordinal < 0x80:
                # 00000000 - 01111111: an ascii char that occupies 1 byte.
                current_code_point_bytes_count = 1
            elif leading_byte_ordinal < 0xC0:
                # 10000000 - 10111111: this is the format of a trailing
                # (continuation) byte, so it cannot start a code point.
                raise SmartStrException(
                    'UTF-8 str format error. Illegal format for the starting byte of a code point.'
                )
            elif leading_byte_ordinal < 0xE0:
                # 11000000 - 11011111.
                current_code_point_bytes_count = 2
            elif leading_byte_ordinal < 0xF0:
                # 11100000 - 11101111.
                current_code_point_bytes_count = 3
            elif leading_byte_ordinal < 0xF8:
                # 11110000 - 11110111.
                # NOTE(review): leading bytes 0xC0, 0xC1 and 0xF5 - 0xF7 are
                # still accepted by this method; confirm whether overlong /
                # out-of-range encodings are meant to be rejected here.
                current_code_point_bytes_count = 4
            else:
                raise SmartStrException(
                    'UTF-8 str format error. Illegal format for the starting byte of a code point.'
                )

            # Length and trailing-byte checks are the same for every size.
            self._validate_utf_8_code_point_consistency(
                current_byte_idx, current_code_point_bytes_count)

            next_byte_idx = current_byte_idx + current_code_point_bytes_count
            current_code_point = CodePoint(
                self._raw_utf_8_sequence[current_byte_idx:next_byte_idx])

            current_byte_idx = next_byte_idx

            # True once the code point just read ends exactly at the end of
            # the raw sequence.
            last_code_point_in_str = current_byte_idx == len(
                self._raw_utf_8_sequence)
            previous_code_point = self._convert_code_points_to_char(
                previous_code_point, current_code_point,
                last_code_point_in_str)
コード例 #7
0
ファイル: code_point.py プロジェクト: markk2504/smart-string
    def get_utf_16_code_point_val(utf_16_raw_sequence):
        """Return the Unicode numeric value of a single UTF-16 code point.

        Args:
            utf_16_raw_sequence: A sequence of 1 or 2 UTF-16 words. A 2-word
                sequence must be a high surrogate (0xD800 - 0xDBFF) followed
                by a low surrogate (0xDC00 - 0xDFFF).

        Returns:
            The numeric value of the code point. For a single word this is
            the word's ordinal; for a surrogate pair it is the combined value.

        Raises:
            SmartStrException: If the sequence length is not 1 or 2, or a
                2-word sequence is not a valid surrogate pair.
        """
        utf_16_raw_sequence_len = len(utf_16_raw_sequence)

        if utf_16_raw_sequence_len == 1:
            return ord(utf_16_raw_sequence)

        if utf_16_raw_sequence_len == 2:
            high_surrogate_val = ord(utf_16_raw_sequence[0])
            low_surrogate_val = ord(utf_16_raw_sequence[1])

            # Without this check any two words would be combined by the
            # formula below into a meaningless (possibly negative) value.
            is_high = 0xD800 <= high_surrogate_val <= 0xDBFF
            is_low = 0xDC00 <= low_surrogate_val <= 0xDFFF
            if not (is_high and is_low):
                raise SmartStrException(
                    'UTF-16 sequence of 2 words is not a valid surrogate pair (words 0x{:04X}, 0x{:04X}).'
                    .format(high_surrogate_val, low_surrogate_val))

            return (0x10000 +
                    ((high_surrogate_val - 0xD800) * 0x400) +
                    (low_surrogate_val - 0xDC00))

        raise SmartStrException(
            'UTF-16 sequence of length {} does not represent a valid code point.'
            .format(utf_16_raw_sequence_len))
コード例 #8
0
    def __init__(self, raw_sequence):
        """Initialize the Smart String from a raw sequence and process it.

        Args:
            raw_sequence: A 'str' (stored as the raw UTF-8 sequence) or a
                'unicode' (stored as the raw UTF-16 sequence).

        Raises:
            SmartStrException: If raw_sequence is of any other type.
        """
        # Pick the per-type settings first; the attributes are assigned below.
        if isinstance(raw_sequence, str):
            init_type = StrInitInputType.UTF_8
            utf_8_sequence, utf_16_sequence = raw_sequence, None
            process_raw_sequence = self._process_rew_utf_8_str
        elif isinstance(raw_sequence, unicode):
            init_type = StrInitInputType.UTF_16
            utf_8_sequence, utf_16_sequence = None, raw_sequence
            process_raw_sequence = self._process_rew_utf_16_str
        else:
            raise SmartStrException(
                'Illegal Smart String raw sequence type {}.'.format(
                    type(raw_sequence)))

        self._init_raw_type = init_type
        self._raw_utf_8_sequence = utf_8_sequence
        self._raw_utf_16_sequence = utf_16_sequence
        self._characters = []
        process_raw_sequence()
コード例 #9
0
ファイル: code_point.py プロジェクト: markk2504/smart-string
    def get_utf_16_code_point_sequence(code_point_unicode_val):
        """Encode a Unicode numeric value as a UTF-16 sequence of words.

        Args:
            code_point_unicode_val: Numeric value of the code point.

        Returns:
            A 'unicode' holding 1 word for values 0x0000 - 0xFFFF, or a
            high surrogate followed by a low surrogate for values
            0x10000 - 0x10FFFF.

        Raises:
            SmartStrException: If the value is outside 0x0000 - 0x10FFFF.
        """
        if 0x0000 <= code_point_unicode_val <= 0xFFFF:
            return unichr(code_point_unicode_val)

        if 0x10000 <= code_point_unicode_val <= 0x10FFFF:
            offset = code_point_unicode_val - 0x10000
            # Explicit floor division: a plain '/' produces a float under
            # true division, which unichr() does not accept.
            high_surrogate = (offset // 0x400) + 0xD800
            low_surrogate = (offset % 0x400) + 0xDC00
            return unichr(high_surrogate) + unichr(low_surrogate)

        raise SmartStrException(
            'Unicode code point value {} is illegal.'.format(
                code_point_unicode_val))
コード例 #10
0
ファイル: code_point.py プロジェクト: markk2504/smart-string
    def get_utf_8_code_point_val(utf_8_raw_sequence):
        """Return the Unicode numeric value of a single UTF-8 code point.

        The payload bits of the leading byte are selected by a mask that
        depends on the sequence length; every following byte contributes its
        low 6 bits.

        Args:
            utf_8_raw_sequence: A sequence of 1 to 4 UTF-8 bytes.

        Returns:
            The numeric value assembled from the payload bits of the bytes.

        Raises:
            SmartStrException: If the sequence length is not 1, 2, 3 or 4.
        """
        # Payload mask of the leading byte, keyed by the sequence length.
        leading_byte_masks = {
            1: 0b01111111,
            2: 0b00011111,
            3: 0b00001111,
            4: 0b00000111,
        }

        sequence_len = len(utf_8_raw_sequence)
        if sequence_len not in leading_byte_masks:
            raise SmartStrException(
                'UTF-8 sequence of length {} does not represent a valid code point.'
                .format(sequence_len))

        byte_ordinals = [
            ord(utf_8_raw_sequence[position])
            for position in range(sequence_len)
        ]

        code_point_val = byte_ordinals[0] & leading_byte_masks[sequence_len]
        # Each trailing byte appends its low 6 bits to the accumulated value.
        for trailing_ordinal in byte_ordinals[1:]:
            code_point_val = (code_point_val << 6) + (trailing_ordinal
                                                      & 0b00111111)

        return code_point_val
コード例 #11
0
ファイル: code_point.py プロジェクト: markk2504/smart-string
    def get_utf_8_code_point_sequence(code_point_unicode_val):
        """Encode a Unicode numeric value as a UTF-8 sequence of bytes.

        Args:
            code_point_unicode_val: Numeric value of the code point.

        Returns:
            A 'str' of 1 byte for 0x0000 - 0x007F, 2 bytes for
            0x0080 - 0x07FF, 3 bytes for 0x0800 - 0xFFFF and 4 bytes for
            0x10000 - 0x10FFFF.

        Raises:
            SmartStrException: If the value is outside all of those ranges.
        """
        value = code_point_unicode_val
        continuation_marker = 0b10000000
        payload_mask = 0b00111111

        if 0x0000 <= value <= 0x007F:
            byte_values = [value]
        elif 0x0080 <= value <= 0x07FF:
            byte_values = [
                0b11000000 | (value >> 6),
                continuation_marker | (value & payload_mask),
            ]
        elif 0x0800 <= value <= 0xFFFF:
            byte_values = [
                0b11100000 | (value >> 12),
                continuation_marker | ((value >> 6) & payload_mask),
                continuation_marker | (value & payload_mask),
            ]
        elif 0x10000 <= value <= 0x10FFFF:
            byte_values = [
                0b11110000 | (value >> 18),
                continuation_marker | ((value >> 12) & payload_mask),
                continuation_marker | ((value >> 6) & payload_mask),
                continuation_marker | (value & payload_mask),
            ]
        else:
            raise SmartStrException(
                'Unicode code point value {} is illegal.'.format(
                    code_point_unicode_val))

        # Turn the numeric byte values into a single str.
        return ''.join([chr(byte_value) for byte_value in byte_values])
コード例 #12
0
 def __getitem__(self, i):
     if i < 0 or i > len(self._characters) - 1:
         raise SmartStrException(
             'Illegal index {} for the Smart String subscript oparator.'.
             format(i))
     return self.characters[i]
コード例 #13
0
 def _validate_non_list_input(code_point):
     """Validate a single (non-list) object used to initialize a Smart Char.

     Args:
         code_point: The object to validate.

     Raises:
         SmartStrException: If the object is not a CodePoint instance.
     """
     if isinstance(code_point, CodePoint):
         return

     offending_type = type(code_point)
     raise SmartStrException(
         'Illegal type {} for the Smart Char initialization.'.format(
             offending_type))
コード例 #14
0
 def __getitem__(self, i):
     if i < 0 or i > len(self._code_points) - 1:
         raise SmartStrException(
             'Illegal index {} for the Smart Char subscript operator.'.
             format(i))
     return self._code_points[i]