def _validate_utf_8_code_point_consistency(self, code_point_start_byte_index, code_point_bytes_count): # Make sure that the UTF-8 str has sufficient length to accommodate all the bytes of the # code point. if code_point_start_byte_index + code_point_bytes_count > len( self._raw_utf_8_sequence): raise SmartStrException( 'UTF-8 str format error. Code point at index {} is of size {}, but the total length of the str is {}.' .format(code_point_start_byte_index, code_point_bytes_count, len(self._raw_utf_8_sequence))) # Validate the format of the trailing bytes of the code point. for i in range(code_point_start_byte_index + 1, code_point_start_byte_index + code_point_bytes_count): current_trailing_byte = self._raw_utf_8_sequence[i] current_trailing_byte_ordinal = ord(current_trailing_byte) # Make sure that the trailing bytes of the code point have the correct format # (10000000 - 10111111). if current_trailing_byte_ordinal < 0x80 or current_trailing_byte_ordinal > 0xBF: raise SmartStrException( 'UTF-8 str format error. Code point trailing byte has a faulty format (byte at index {} of size {}, offset {}).' .format(code_point_start_byte_index, code_point_bytes_count, i - code_point_start_byte_index))
def _validate_utf_16_code_point_consistency(self, code_point_start_word_index, code_point_words_count): # Make sure that the UTF-16 str has sufficient length to accommodate all the words of the # code point. if code_point_start_word_index + code_point_words_count > len( self._raw_utf_16_sequence): raise SmartStrException( 'UTF-16 str format error. Code point at index {} is of size {}, but the total length of the str is {}.' .format(code_point_start_word_index, code_point_words_count, len(self._raw_utf_16_sequence))) # If the code point consists of 2 words, verify that those words are a surrogate pair # (high surrogate and low surrogate). if code_point_words_count == 2: # Verify that the first word is a high surrogate. first_code_point_word = self._raw_utf_16_sequence[ code_point_start_word_index] if not self._is_utf_16_high_surrogate(first_code_point_word): raise SmartStrException( 'UTF-16 str format error. Code point consists of 2 words, but the first word is not a high surrogate (word at index {}).' .format(code_point_start_word_index)) # Verify that the second word is a low surrogate. second_code_point_word = self._raw_utf_16_sequence[ code_point_start_word_index + 1] if not self._is_utf_16_low_surrogate(second_code_point_word): raise SmartStrException( 'UTF-16 str format error. Code point consists of 2 words, but the second word is not a low surrogate (word at index {}).' .format(code_point_start_word_index + 1))
def _validate_list_input(code_points): if len(code_points) > 2: raise SmartStrException( 'The list to initialize the Smart Char object is too long - {} objects.' .format(len(code_points))) for code_point in code_points: if not isinstance(code_point, CodePoint): raise SmartStrException( 'Illegal type {} of an object in the list used for the Smart Char initialization.' .format(type(code_point)))
def __init__(self, raw_input):
    """Initialize the code point from one of three raw representations.

    * ``int``     - a Unicode numeric value,
    * ``str``     - a UTF-8 sequence of bytes,
    * ``unicode`` - a UTF-16 sequence of words.

    The raw input is kept in the attribute matching its representation
    (the other one is set to None), the numeric Unicode value is derived
    from it, and the code point type is looked up from that value.

    Raises:
        SmartStrException: if ``raw_input`` is of any other type.
    """
    # Pick the per-representation settings first; nothing is stored on the
    # instance until the input type is known to be supported.
    if isinstance(raw_input, int):
        init_type = StrInitInputType.UNICODE_VAL
        utf_8_sequence = None
        utf_16_sequence = None
        resolve_unicode_val = self.get_validated_unicode_val
    elif isinstance(raw_input, str):
        init_type = StrInitInputType.UTF_8
        utf_8_sequence = raw_input
        utf_16_sequence = None
        resolve_unicode_val = self.get_utf_8_code_point_val
    elif isinstance(raw_input, unicode):
        init_type = StrInitInputType.UTF_16
        utf_8_sequence = None
        utf_16_sequence = raw_input
        resolve_unicode_val = self.get_utf_16_code_point_val
    else:
        raise SmartStrException(
            'Illegal Code Point raw input type {}.'.format(
                type(raw_input)))

    self._init_raw_type = init_type
    self._raw_utf_8_sequence = utf_8_sequence
    self._raw_utf_16_sequence = utf_16_sequence
    self._unicode_val = resolve_unicode_val(raw_input)
    self._code_point_type = self.get_code_point_type(self._unicode_val)
def get_validated_unicode_val(unicode_val):
    """Return ``unicode_val`` unchanged if it lies in 0x0000 - 0x10FFFF.

    Raises:
        SmartStrException: if the value is outside that range.
    """
    is_in_unicode_range = 0x0000 <= unicode_val <= 0x10FFFF
    if not is_in_unicode_range:
        raise SmartStrException(
            'Int value {} is not a valid value for a Unicode code point'.
            format(unicode_val))
    return unicode_val
def _process_rew_utf_8_str(self):
    """Split the raw UTF-8 sequence into code points and process them.

    Walks ``self._raw_utf_8_sequence`` byte by byte. The leading byte of
    each code point determines how many bytes it occupies:

    * 00000000 - 01111111: 1 byte (ASCII),
    * 11000000 - 11011111: 2 bytes,
    * 11100000 - 11101111: 3 bytes,
    * 11110000 - 11110111: 4 bytes.

    Any other leading byte is rejected. That includes continuation bytes
    (10000000 - 10111111), which may only appear after a leading byte and
    were previously mistaken for the start of a 2-byte code point.

    Each code point is validated with
    ``_validate_utf_8_code_point_consistency``, wrapped in a ``CodePoint``
    and handed to ``_convert_code_points_to_char`` together with the value
    that call returned for the previous code point and a flag telling
    whether this is the last code point of the str.

    Raises:
        SmartStrException: if a leading byte has an illegal format or the
            consistency validation of a code point fails.
    """
    raw_sequence = self._raw_utf_8_sequence
    sequence_len = len(raw_sequence)
    current_byte_idx = 0
    previous_code_point = None

    while current_byte_idx < sequence_len:
        lead_byte_ordinal = ord(raw_sequence[current_byte_idx])

        if lead_byte_ordinal < 0x80:
            # 00000000 - 01111111: an ASCII char that occupies 1 byte.
            current_code_point_bytes_count = 1
        elif 0xC0 <= lead_byte_ordinal < 0xE0:
            # 11000000 - 11011111.
            current_code_point_bytes_count = 2
        elif 0xE0 <= lead_byte_ordinal < 0xF0:
            # 11100000 - 11101111.
            current_code_point_bytes_count = 3
        elif 0xF0 <= lead_byte_ordinal < 0xF8:
            # 11110000 - 11110111.
            current_code_point_bytes_count = 4
        else:
            # Either a stray continuation byte (10000000 - 10111111) or a
            # byte of the form 11111xxx; neither can start a code point.
            raise SmartStrException(
                'UTF-8 str format error. Illegal format for the starting byte of a code point.'
            )

        self._validate_utf_8_code_point_consistency(
            current_byte_idx, current_code_point_bytes_count)

        current_code_point_sequence = raw_sequence[
            current_byte_idx:current_byte_idx + current_code_point_bytes_count]
        current_code_point = CodePoint(current_code_point_sequence)

        current_byte_idx += current_code_point_bytes_count
        last_code_point_in_str = current_byte_idx == sequence_len

        # The returned value is carried over to the next iteration as the
        # "previous" code point.
        previous_code_point = self._convert_code_points_to_char(
            previous_code_point, current_code_point, last_code_point_in_str)
def get_utf_16_code_point_val(utf_16_raw_sequence):
    """Return the Unicode value encoded by a UTF-16 sequence of 1 or 2 words.

    A single word is returned as its own ordinal. Two words are decoded as
    a surrogate pair:

        0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)

    The pair is validated first (high surrogate 0xD800 - 0xDBFF followed
    by low surrogate 0xDC00 - 0xDFFF). Without that check, any 2-word
    input that is not a surrogate pair would silently decode to a
    meaningless value.

    Raises:
        SmartStrException: if the sequence length is not 1 or 2, or the
            2 words do not form a valid surrogate pair.
    """
    utf_16_raw_sequence_len = len(utf_16_raw_sequence)

    if utf_16_raw_sequence_len == 1:
        return ord(utf_16_raw_sequence)

    if utf_16_raw_sequence_len == 2:
        high_surrogate_val = ord(utf_16_raw_sequence[0])
        low_surrogate_val = ord(utf_16_raw_sequence[1])

        is_surrogate_pair = (0xD800 <= high_surrogate_val <= 0xDBFF and
                             0xDC00 <= low_surrogate_val <= 0xDFFF)
        if not is_surrogate_pair:
            raise SmartStrException(
                'UTF-16 str format error. Words 0x{:04X} and 0x{:04X} do not form a valid surrogate pair.'
                .format(high_surrogate_val, low_surrogate_val))

        return (0x10000 + ((high_surrogate_val - 0xD800) * 0x400) +
                (low_surrogate_val - 0xDC00))

    raise SmartStrException(
        'UTF-16 sequence of length {} does not represent a valid code point.'
        .format(utf_16_raw_sequence_len))
def __init__(self, raw_sequence):
    """Initialize the Smart String from a raw UTF-8 or UTF-16 sequence.

    A ``str`` is treated as a UTF-8 sequence of bytes and a ``unicode`` as
    a UTF-16 sequence of words. The raw input is stored in the attribute
    matching its encoding (the other one is set to None),
    ``self._characters`` is reset to an empty list, and the matching
    processing method is run.

    Raises:
        SmartStrException: if ``raw_sequence`` is neither ``str`` nor
            ``unicode``.
    """
    # Pick the per-encoding settings first; nothing is stored on the
    # instance until the input type is known to be supported.
    if isinstance(raw_sequence, str):
        init_type = StrInitInputType.UTF_8
        utf_8_sequence = raw_sequence
        utf_16_sequence = None
        process_raw_sequence = self._process_rew_utf_8_str
    elif isinstance(raw_sequence, unicode):
        init_type = StrInitInputType.UTF_16
        utf_8_sequence = None
        utf_16_sequence = raw_sequence
        process_raw_sequence = self._process_rew_utf_16_str
    else:
        raise SmartStrException(
            'Illegal Smart String raw sequence type {}.'.format(
                type(raw_sequence)))

    self._init_raw_type = init_type
    self._raw_utf_8_sequence = utf_8_sequence
    self._raw_utf_16_sequence = utf_16_sequence
    self._characters = []
    process_raw_sequence()
def get_utf_16_code_point_sequence(code_point_unicode_val):
    """Return the UTF-16 sequence (1 or 2 words) for a Unicode value.

    Values 0x0000 - 0xFFFF are returned as a single word (values in the
    surrogate range are not rejected here). Values 0x10000 - 0x10FFFF are
    split into a surrogate pair:

        high = ((value - 0x10000) // 0x400) + 0xD800
        low  = ((value - 0x10000) %  0x400) + 0xDC00

    Raises:
        SmartStrException: if the value is outside 0x0000 - 0x10FFFF.
    """
    if 0x0000 <= code_point_unicode_val <= 0xFFFF:
        return unichr(code_point_unicode_val)

    if 0x10000 <= code_point_unicode_val <= 0x10FFFF:
        surrogate_offset = code_point_unicode_val - 0x10000
        # Floor division is spelled out explicitly: a plain "/" yields a
        # float under true division (e.g. "from __future__ import
        # division"), which unichr() does not accept.
        high_surrogate = (surrogate_offset // 0x400) + 0xD800
        low_surrogate = (surrogate_offset % 0x400) + 0xDC00
        return unichr(high_surrogate) + unichr(low_surrogate)

    raise SmartStrException(
        'Unicode code point value {} is illegal.'.format(
            code_point_unicode_val))
def get_utf_8_code_point_val(utf_8_raw_sequence):
    """Return the Unicode value encoded by a UTF-8 sequence of 1 to 4 bytes.

    The payload bits of the leading byte are selected by a mask that
    depends on the sequence length; each following byte contributes its
    low 6 bits. The marker bits themselves are not validated here.

    Raises:
        SmartStrException: if the sequence length is not 1, 2, 3 or 4.
    """
    # Payload mask of the leading byte, keyed by the total sequence length.
    leading_byte_masks = {
        1: 0b01111111,
        2: 0b00011111,
        3: 0b00001111,
        4: 0b00000111,
    }
    trailing_byte_mask = 0b00111111

    sequence_len = len(utf_8_raw_sequence)
    if sequence_len not in leading_byte_masks:
        raise SmartStrException(
            'UTF-8 sequence of length {} does not represent a valid code point.'
            .format(sequence_len))

    code_point_val = ord(utf_8_raw_sequence[0]) & leading_byte_masks[sequence_len]
    # Every trailing byte shifts the accumulated value left by 6 bits and
    # appends its own 6 payload bits.
    for byte_idx in range(1, sequence_len):
        trailing_payload = ord(utf_8_raw_sequence[byte_idx]) & trailing_byte_mask
        code_point_val = (code_point_val << 6) + trailing_payload

    return code_point_val
def get_utf_8_code_point_sequence(code_point_unicode_val):
    """Return the UTF-8 sequence (1 to 4 bytes) for a Unicode value.

    * 0x0000  - 0x007F:   1 byte,
    * 0x0080  - 0x07FF:   2 bytes,
    * 0x0800  - 0xFFFF:   3 bytes,
    * 0x10000 - 0x10FFFF: 4 bytes.

    Raises:
        SmartStrException: if the value is outside 0x0000 - 0x10FFFF.
    """
    continuation_marker = 0b10000000
    payload_mask = 0b00111111
    value = code_point_unicode_val

    # Work out the numeric value of every byte first, convert afterwards.
    if 0x0000 <= value <= 0x007F:
        byte_values = [value]
    elif 0x0080 <= value <= 0x07FF:
        byte_values = [
            0b11000000 | (value >> 6),
            continuation_marker | (value & payload_mask),
        ]
    elif 0x0800 <= value <= 0xFFFF:
        byte_values = [
            0b11100000 | (value >> 12),
            continuation_marker | ((value >> 6) & payload_mask),
            continuation_marker | (value & payload_mask),
        ]
    elif 0x10000 <= value <= 0x10FFFF:
        byte_values = [
            0b11110000 | (value >> 18),
            continuation_marker | ((value >> 12) & payload_mask),
            continuation_marker | ((value >> 6) & payload_mask),
            continuation_marker | (value & payload_mask),
        ]
    else:
        raise SmartStrException(
            'Unicode code point value {} is illegal.'.format(
                code_point_unicode_val))

    return ''.join([chr(byte_value) for byte_value in byte_values])
def __getitem__(self, i): if i < 0 or i > len(self._characters) - 1: raise SmartStrException( 'Illegal index {} for the Smart String subscript oparator.'. format(i)) return self.characters[i]
def _validate_non_list_input(code_point):
    """Validate a single object used to initialize a Smart Char.

    Raises:
        SmartStrException: if ``code_point`` is not a ``CodePoint``.
    """
    if isinstance(code_point, CodePoint):
        return
    offending_type = type(code_point)
    raise SmartStrException(
        'Illegal type {} for the Smart Char initialization.'.format(
            offending_type))
def __getitem__(self, i): if i < 0 or i > len(self._code_points) - 1: raise SmartStrException( 'Illegal index {} for the Smart Char subscript operator.'. format(i)) return self._code_points[i]