def get_sourcestr_representation(self):
    """
        DCharacterLAT.get_sourcestr_representation

        Build and return the source string (str) describing <self>.

        * unknown character : return UNKNOWN_CHAR_SYMBOL if the option
          "anonymize the unknown characters" of <self.dstring_object> is set
          to 'yes', <self.base_char> otherwise.

        * recognized character : return the base symbol followed by the
          stress, length and diaeresis symbols (in that order); the result is
          composed with unicodedata.normalize('NFC', ...) and then with the
          replacements listed in COMPLETE_NORMALIZE_NFC.
    """
    #.......................................................................
    # unknown character : no analysis required.
    #.......................................................................
    if self.unknown_char:
        anonymize = self.dstring_object.options["anonymize the unknown characters"]
        return UNKNOWN_CHAR_SYMBOL if anonymize == 'yes' else self.base_char

    #.......................................................................
    # recognized character : base symbol, then the diacritics.
    #.......................................................................
    pieces = []

    if self.base_char is not None:
        if self.punctuation:
            # punctuation : <base_char> is used as it is.
            # NOTE(review): the two other branches go through
            # get_default_symbol(); presumably punctuation names and symbols
            # are identical - confirm against SYMB_PUNCTUATION.
            symbol = self.base_char
        elif self.capital_letter:
            # upper case :
            symbol = SYMB_UPPER_CASE.get_default_symbol(self.base_char)
        else:
            # lower case :
            symbol = SYMB_LOWER_CASE.get_default_symbol(self.base_char)
        pieces.append(symbol)

    if self.stress:
        pieces.append(DEFAULTSYMB__STRESS)

    if self.length in ('short', 'long'):
        pieces.append(SYMB_DIACRITICS.get_default_symbol(self.length))

    if self.diaeresis:
        pieces.append(DEFAULTSYMB__DIAERESIS)

    # (1/2) composition with unicodedata.normalize :
    composed = unicodedata.normalize('NFC', "".join(pieces))

    # (2/2) composition with COMPLETE_NORMALIZE_NFC, applied after the
    # standard NFC composition, one (src, dest) pair at a time :
    for src, dest in COMPLETE_NORMALIZE_NFC:
        composed = composed.replace(src, dest)

    return composed
def init_from_str(self, str_src):
    """
        DStringLAT.init_from_str

        Function called by __init__() : fill <self> with the characters read
        from <str_src>.

        str_src : str

        No value is explicitly returned : the result is the sequence of
        DCharacterLAT objects appended to <self>. Every part of the string
        not matched by DStringLAT.pattern is stored character by character
        as an "unknown" DCharacterLAT object.

        Raise a DCharsError if, for one character, the length, the stress or
        the diaeresis is defined several times.

        HOW IT WORKS :
        * (1) str_src -> unicodedata.normalize('NFD', ...) -> normalized_src
        * (2) normalized_src -> replace_by_the_default_symbols()
              (punctuation, lower case, upper case, diacritics)
        * (3) for each match of DStringLAT.pattern {letter+diacritics} :
              * (3.0) unknown characters placed before the match
              * (3.1) base_char
              * (3.2) length
              * (3.3) stress
              * (3.4) diaeresis
              * (3.5) the new character is added
        * (4) unknown characters placed after the last match
    """
    #.......................................................................
    # (1) decomposition of the source string.
    #.......................................................................
    normalized_src = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) only the default symbols are kept (same order as before :
    #     punctuation, lower case, upper case, diacritics).
    #.......................................................................
    for symbols in (SYMB_PUNCTUATION,
                    SYMB_LOWER_CASE,
                    SYMB_UPPER_CASE,
                    SYMB_DIACRITICS):
        normalized_src = symbols.replace_by_the_default_symbols(normalized_src)

    def append_unknown_chars(first, stop):
        """Append to <self> one unknown character per index in [first, stop)."""
        for position in range(first, stop):
            self.append(DCharacterLAT(dstring_object=self,
                                      unknown_char=True,
                                      base_char=normalized_src[position]))

    def check_defined_once(match, diacritics, symbols, err_msg):
        """Raise a DCharsError if <symbols> appear more than once in <diacritics>."""
        occurences = number_of_occurences(source_string=diacritics,
                                          symbols=symbols)
        if occurences > 1:
            raise DCharsError(context="DStringLAT.init_from_str",
                              message=err_msg.format(match.string,
                                                     match.start(),
                                                     match.end()),)

    #.......................................................................
    # (3) initialisation from the recognized characters.
    #.......................................................................
    # index of the first character not yet stored in <self> :
    next_unread = 0

    for element in re.finditer(DStringLAT.pattern, normalized_src):

        # (3.0) unknown character(s) between the previous match (or the
        # beginning of the string) and the current match :
        append_unknown_chars(next_unread, element.start())

        # the analysed substring covers ( start, end-1 ); the next unread
        # index is the greatest of these two values, plus one.
        next_unread = max(element.start(), element.end() - 1) + 1

        groups = element.groupdict()
        letter = groups['letter']
        diacritics = groups['diacritics']

        punctuation = letter in SYMB_PUNCTUATION.symbol2name
        capital_letter = letter in SYMB_UPPER_CASE.symbol2name

        # (3.1) base_char
        if punctuation:
            base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)
        elif capital_letter:
            base_char = SYMB_UPPER_CASE.get_the_name_for_this_symbol(letter)
        else:
            base_char = SYMB_LOWER_CASE.get_the_name_for_this_symbol(letter)

        length, stress, diaeresis = None, False, False

        if diacritics is not None:
            # (3.2) length
            check_defined_once(
                element, diacritics, SYMB_DIACRITICS__LENGTH,
                "In '{0}' (start={1}, end={2}), length defined several times.")

            if SYMB_DIACRITICS.are_these_symbols_in_a_string('short', diacritics):
                length = "short"
            elif SYMB_DIACRITICS.are_these_symbols_in_a_string('long', diacritics):
                length = "long"

            # (3.3) stress
            check_defined_once(
                element, diacritics, SYMB_DIACRITICS__STRESS,
                "In '{0}' (start={1}, end={2}), stress defined several times.")

            stress = SYMB_DIACRITICS.are_these_symbols_in_a_string("stress",
                                                                   diacritics)

            # (3.4) diaeresis
            check_defined_once(
                element, diacritics, SYMB_DIACRITICS__DIAERESIS,
                "In '{0}' (start={1}, end={2}), diaeresis defined several times.")

            diaeresis = SYMB_DIACRITICS.are_these_symbols_in_a_string("diaeresis",
                                                                      diacritics)

        # (3.5) the new character is added
        self.append(DCharacterLAT(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  punctuation=punctuation,
                                  capital_letter=capital_letter,
                                  length=length,
                                  stress=stress,
                                  diaeresis=diaeresis))

    #.......................................................................
    # (4) final unknown characters : everything after the last match, or
    #     the whole string if nothing has been recognized.
    #.......................................................................
    append_unknown_chars(next_unread, len(normalized_src))