def init_from_str(self, str_src):
    """
        DStringSAN.init_from_str

        Function called by __init__() : fill <self> with the characters read
        from <str_src>.

        Nothing is returned : every character found, recognized or not, is
        appended to <self> through self.append().

        str_src : str

        Raise a DCharsError if number_of_occurences() reports that one kind
        of diacritic (virama, accent, nukta, anusvara/candrabindu, anudatta)
        appears more than once in the diacritics of a single character.
        Characters appended before the error is raised stay in <self>.

        HOW IT WORKS :
        * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        * (2) normalized_src -> each symbol table rewrites the string through
              replace_by_the_default_symbols() -> normalized_src
        * (3) initialisation from the recognized characters.
              re.finditer(DStringSAN.pattern) gives the groups
              {basechar, dependentvowel, diacritics}; the parts of the string
              lying between two matches are stored as unknown characters.

              * (3.1) virama
              * (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
              * (3.3) accent
              * (3.4) nukta
              * (3.5) anusvara_candrabindu
              * (3.6) anudatta
              * (3.7) we add the new character
    """
    #.......................................................................
    # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
    #.......................................................................
    normalized_src = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) normalized_src -> replace_by_the_default_symbols() -> normalized_src
    #     The order of the calls is kept as is : each call works on the
    #     result of the previous one.
    #.......................................................................
    for symbols_table in (SYMB_CONSONANTS,
                          SYMB_INDEPENDENT_VOWELS,
                          SYMB_DEPENDENT_VOWELS,
                          SYMB_DIACRITICS,
                          SYMB_PUNCTUATION):
        normalized_src = symbols_table.replace_by_the_default_symbols(normalized_src)

    #.......................................................................
    # local helpers
    #.......................................................................
    def append_unknown_chars(start, stop):
        """Append normalized_src[start:stop] to <self>, one unknown character
        per code point."""
        for index in range(start, stop):
            self.append(DCharacterSAN(dstring_object=self,
                                      unknown_char=True,
                                      base_char=normalized_src[index]))

    def check_defined_once(element, diacritics, symbols, label):
        """Raise a DCharsError if <symbols> appear more than once in <diacritics>."""
        if number_of_occurences(source_string=diacritics, symbols=symbols) > 1:
            err_msg = "In '{0}' (start={1}, end={2}), '{3}' defined several times."
            raise DCharsError(context="DStringSAN.init_from_str",
                              message=err_msg.format(element.string,
                                                     element.start(),
                                                     element.end(),
                                                     label))

    def first_name_found(symbols, diacritics):
        """Return the name of the first symbol of <symbols> present in
        <diacritics>, None if there is none."""
        for symbol in symbols:
            name = SYMB_DIACRITICS.defaultsymbol2name[symbol]
            if SYMB_DIACRITICS.are_these_symbols_in_a_string(name=name,
                                                             string=diacritics):
                return name
        return None

    #.......................................................................
    # (3) initialisation from the recognized characters.
    #     re.finditer(DStringSAN.pattern) gives the symbols
    #     {basechar, dependentvowel, diacritics}
    #.......................................................................
    # offset of the first character of <normalized_src> not yet stored in <self> :
    next_unparsed = 0

    for element in re.finditer(DStringSAN.pattern, normalized_src):

        # unknown character(s) before the first match or between the previous
        # match and the current one :
        append_unknown_chars(next_unparsed, element.start())
        next_unparsed = element.end()

        data = element.groupdict()
        base_char = data['basechar']
        dependentvowel = data['dependentvowel']
        diacritics = data['diacritics']

        # base_char as "क" becomes "KA"
        base_char__punctuation = SYMB_PUNCTUATION.get_the_name_for_this_symbol(base_char)
        base_char__other_symbols = SYMB_OTHER_SYMBOLS.get_the_name_for_this_symbol(base_char)
        base_char__consonant = SYMB_CONSONANTS.get_the_name_for_this_symbol(base_char)
        base_char__ivowel = SYMB_INDEPENDENT_VOWELS.get_the_name_for_this_symbol(base_char)
        base_char__dvowel = SYMB_DEPENDENT_VOWELS.get_the_name_for_this_symbol(dependentvowel)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.1) virama
        #       Read before (3.2) since the implicit vowel 'A' depends on it.
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        virama = False
        if diacritics is not None:
            check_defined_once(element, diacritics, SYMB_DIACRITICS__VIRAMA, 'virama')
            virama = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN VIRAMA',
                                                                   diacritics)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # only the last branch below turns <is_an_independent_vowel> to True :
        is_an_independent_vowel = False
        punctuation = base_char__punctuation is not None

        if punctuation:
            # punctuation symbol :
            base_char = base_char__punctuation

        elif base_char__other_symbols is not None:
            # "other symbol" : not punctuation nor consonant nor independent vowel :
            base_char = base_char__other_symbols

        elif base_char__consonant is not None:
            # consonant :
            base_char = base_char__consonant

            # dependent vowel ?
            if base_char != 'DEVANAGARI SIGN VISARGA' and \
               not virama and dependentvowel is None:
                # special case : for normal consonants (and visarga is a pseudo-consonant)
                # written without any vowel symbol, the dependent vowel
                # is 'A'. E.g. 'क' stands for 'ka', not for 'k'.
                dependentvowel = "A"
            else:
                dependentvowel = base_char__dvowel

        else:
            # independent vowel :
            is_an_independent_vowel = True
            dependentvowel = None
            base_char = base_char__ivowel

        accent = None
        nukta = False
        anusvara_candrabindu = None
        anudatta = False
        if diacritics is not None:
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) accent
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_defined_once(element, diacritics, SYMB_DIACRITICS__ACCENTS, 'accent')
            accent = first_name_found(SYMB_DIACRITICS__ACCENTS, diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.4) nukta
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_defined_once(element, diacritics, SYMB_DIACRITICS__NUKTA, 'nukta')
            nukta = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN NUKTA',
                                                                  diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.5) anusvara_candrabindu
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_defined_once(element, diacritics,
                               SYMB_DIACRITICS__ANUSVARA_CANDRABINDU,
                               'anusvara_candrabindu')
            anusvara_candrabindu = first_name_found(SYMB_DIACRITICS__ANUSVARA_CANDRABINDU,
                                                    diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.6) anudatta
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_defined_once(element, diacritics, SYMB_DIACRITICS__ANUDATTA, 'anudatta')
            anudatta = SYMB_DIACRITICS.are_these_symbols_in_a_string(
                'DEVANAGARI STRESS SIGN ANUDATTA',
                diacritics)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.7) we add the new character
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        self.append(DCharacterSAN(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  accent=accent,
                                  punctuation=punctuation,
                                  nukta=nukta,
                                  anusvara_candrabindu=anusvara_candrabindu,
                                  virama=virama,
                                  anudatta=anudatta,
                                  is_an_independent_vowel=is_an_independent_vowel,
                                  dependentvowel=dependentvowel))

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # we add the final unknown characters : everything after the last match,
    # or the whole string if nothing has been matched.
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    append_unknown_chars(next_unparsed, len(normalized_src))