def get_sourcestr_representation(self):
    """
        DCharacterSAN.get_sourcestr_representation

        Build the source string matching this character and return it (str).

        An unknown character is returned as UNKNOWN_CHAR_SYMBOL when the option
        "anonymize the unknown characters" is "yes", as its raw base character
        otherwise. A known character is rebuilt from the default symbols of its
        components, then composed in three steps : PRE_NORMALIZE_NFC
        replacements, unicodedata NFC normalization, POST_NORMALIZE_NFC
        replacements.
    """
    # .......................................................................
    # unknown character : there is nothing to analyse.
    # .......................................................................
    if self.unknown_char:
        anonymize = self.dstring_object.options["anonymize the unknown characters"]
        return UNKNOWN_CHAR_SYMBOL if anonymize == "yes" else self.base_char

    # .......................................................................
    # known character : we collect the default symbol of each component.
    # .......................................................................
    symbols = []

    base = self.base_char
    if base is not None:
        # which table describes the base character ?
        if self.punctuation:
            table = SYMB_PUNCTUATION
        elif base in SYMB_OTHER_SYMBOLS:
            # "other symbol" : not punctuation nor consonant nor independent vowel.
            table = SYMB_OTHER_SYMBOLS
        elif self.is_an_independent_vowel:
            table = SYMB_INDEPENDENT_VOWELS
        else:
            table = SYMB_CONSONANTS
        symbols.append(table.get_default_symbol(base))

    # the dependent vowel and the diacritics, in their output order :
    if self.dependentvowel is not None:
        symbols.append(SYMB_DEPENDENT_VOWELS.get_default_symbol(self.dependentvowel))

    if self.nukta:
        symbols.append(DEFAULTSYMB__NUKTA)

    if self.accent is not None:
        symbols.append(SYMB_DIACRITICS.get_default_symbol(self.accent))

    if self.virama:
        symbols.append(DEFAULTSYMB__VIRAMA)

    if self.anudatta:
        symbols.append(DEFAULTSYMB__ANUDATTA)

    if self.anusvara_candrabindu is not None:
        symbols.append(SYMB_DIACRITICS.get_default_symbol(self.anusvara_candrabindu))

    # the fake symbol standing for 'a' is removed : devanagari has no symbol
    # for the dependent vowel 'a'.
    text = "".join(symbols).replace(FAKE_A__SYMBOL, "")

    # (1/3) composition with PRE_NORMALIZE_NFC :
    for before, after in PRE_NORMALIZE_NFC:
        text = text.replace(before, after)

    # (2/3) composition with unicodedata.normalize :
    text = unicodedata.normalize("NFC", text)

    # (3/3) composition with POST_NORMALIZE_NFC :
    for before, after in POST_NORMALIZE_NFC:
        text = text.replace(before, after)

    return text
def get_intstruct_from_trans_str(_src, dstring_object):
    """
        function get_intstruct_from_trans_str()

        _src            : (str) transliterated string like "क". It is read through
                          a Sanskrit DString object (transliteration method
                          "iso15919").
        dstring_object  : object whose .options mapping is read for the keys
                          "anonymize the unknown characters" and
                          "san2bod quality"; it is also handed to every
                          InternalStructure created here.

        Return a ListOfInternalStructures object : one InternalStructure per
        syllable / symbol / unknown character, an intersyllabic tsheg being
        added after every structure having a consonant.

        The option "san2bod quality" selects the simplifications applied
        (see the @@BOD2SAN-... tags below) : "normal" and "low" trigger them,
        any other value keeps the characters as they are.
    """
    # @@BOD2SAN-LOW-005 / @@BOD2SAN-LOW-006 : long vowels > short vowels
    long_to_short_vowels = {'AA': 'A', 'II': 'I', 'UU': 'U'}

    # @@BOD2SAN-LOW-007 : retroflex consonant > non-retroflex consonant,
    # retroflex consonant + aspiration > non-retroflex consonant without aspiration
    retroflex_to_plain = {'TTA': "TA",
                          'TTHA': "TA",
                          'DDA': "DA",
                          'DDHA': "DA",
                          'NNA': "NA"}

    # @@BOD2SAN-LOW-008 : consonant + aspiration > consonant without aspiration.
    # 'TTHA' and 'DDHA' do not appear here : they are always caught before by
    # the retroflex rule. The rule is applied to every key of this mapping, so
    # that 'DHA' is handled like the other aspirated consonants.
    aspirated_to_plain = {'KHA': "KA",
                          'GHA': "GA",
                          'THA': "TA",
                          'CHA': "CA",
                          'JHA': "JA",
                          'DHA': "DA",
                          'PHA': "PA",
                          'BHA': "BA"}

    anonymize = dstring_object.options["anonymize the unknown characters"] == 'yes'

    # list of InternalStructure objects.
    istructs = ListOfInternalStructures(anonymize_the_unknown_chars=anonymize)

    # we read _src through a DSTRING_SAN object :
    dstring_san = new_dstring(language='संस्कृतम्', transliteration_method="iso15919")
    dstring_san = dstring_san(_src)

    # In Sanskrit, if a consonant is followed by a virama, it means that the following
    # consonants are part of a cluster of consonants.
    #
    # E.g. in कर्म (0915=ka, 0930=ra, 094D=virama, 092E=ma) we have something like kar+ma,
    # the -m- having no vowel.
    #
    place_consonant_among_subjc = False

    for dchar_san in dstring_san:

        if dchar_san.unknown_char:
            istructs.append(InternalStructure(dstring_object=dstring_object,
                                              unknown_character=True))
            continue

        base_char = dchar_san.base_char

        # punctuation symbol :
        if base_char in SAN__SYMB_PUNCTUATION:
            unicode_symb = SAN__SYMB_PUNCTUATION.get_default_symbol(base_char)
            istructs.append(InternalStructure(
                dstring_object=dstring_object,
                punctuation_or_other_symbol=PUNCTUATION_INVERSED[unicode_symb]))
            place_consonant_among_subjc = False

        # other symbol :
        elif base_char in SAN__SYMB_OTHER_SYMBOLS:
            unicode_symb = SAN__SYMB_OTHER_SYMBOLS.get_default_symbol(base_char)
            istructs.append(InternalStructure(
                dstring_object=dstring_object,
                punctuation_or_other_symbol=OTHER_SYMBOLS_INVERSED[unicode_symb]))
            place_consonant_among_subjc = False

        # independent vowel :
        elif base_char in SAN__SYMB_INDEPENDENT_VOWELS:
            quality = dstring_object.options["san2bod quality"]

            #...............................................................
            # _independent_vowel will be added as an independent vowel :
            #...............................................................
            if quality in ("normal", "low") and base_char == 'O':
                #====================
                # @@BOD2SAN-NORM-004 / @@BOD2SAN-LOW-004
                # (independent vowel) ओ(ō) > औ(au)
                #====================
                _independent_vowel = "AU"
            elif quality == "low" and base_char in long_to_short_vowels:
                #====================
                # @@BOD2SAN-LOW-006
                # (independent vowel) long vowels > short vowels
                #====================
                _independent_vowel = long_to_short_vowels[base_char]
            else:
                _independent_vowel = base_char

            unicode_symb = SAN__SYMB_INDEPENDENT_VOWELS.get_default_symbol(_independent_vowel)
            istructs.append(InternalStructure(
                dstring_object=dstring_object,
                consonant="A",
                vowel1=INDEPENDENT_VOWELS_INVERSED[unicode_symb]))
            place_consonant_among_subjc = False

        # consonant :
        elif base_char in SAN__SYMB_CONSONANTS:
            quality = dstring_object.options["san2bod quality"]

            if base_char == 'DEVANAGARI SIGN VISARGA':
                # special case : the visarga symbol is placed among consonants in Sanskrit,
                # among diacritics in Tibetan.
                #
                #====================
                # @@BOD2SAN-NORM-001 / @@BOD2SAN-LOW-001
                # the visarga is omitted if "san2bod quality" is "normal" or "low"
                #====================
                if quality not in ("normal", "low"):
                    # NOTE(review): presumably fails with an IndexError if the visarga
                    # is the very first character (nothing to attach it to) - confirm.
                    istructs[-1].rnam_bcad = True

                place_consonant_among_subjc = False

            elif not place_consonant_among_subjc:
                # consonant to be placed as a main consonant
                # (and not among subjoined consonants) :

                #...........................................................
                # _base_char will be added as a main consonant :
                #...........................................................
                if quality in ("normal", "low") and base_char == 'VA':
                    #====================
                    # @@BOD2SAN-NORM-002 / @@BOD2SAN-LOW-002
                    # the व(va) becomes ब(ba) if "san2bod quality" is "normal" or "low"
                    #====================
                    _base_char = "BA"
                elif quality == "low" and base_char in retroflex_to_plain:
                    #===================
                    # @@BOD2SAN-LOW-007
                    #===================
                    _base_char = retroflex_to_plain[base_char]
                elif quality == "low" and base_char in aspirated_to_plain:
                    #===================
                    # @@BOD2SAN-LOW-008
                    #===================
                    _base_char = aspirated_to_plain[base_char]
                else:
                    # general case :
                    _base_char = base_char

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(_base_char)
                bod_consonant = CONSONANTS_INVERSED[unicode_symb]
                istructs.append(InternalStructure(dstring_object=dstring_object,
                                                  consonant=bod_consonant))

                if dchar_san.virama:
                    # the next consonant(s) will be subjoined to this one.
                    place_consonant_among_subjc = True

            else:
                # consonant to be placed among subjoined consonants
                # (and not as a main consonant) :
                if istructs[-1].subfix is None:
                    istructs[-1].subfix = []

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(base_char)
                cons = CONSONANTS_INVERSED[unicode_symb]

                add_this_consonant = True

                if quality == "low" and \
                   istructs[-1].subfix == [] and \
                   istructs[-1].consonant == cons:
                    #===================
                    # @@BOD2SAN-LOW-008 (tag shared with the de-aspiration rule)
                    # geminate consonant > 0
                    #===================
                    add_this_consonant = False

                    # no more subjoined consonant : the other ones will be treated
                    # like main consonants :
                    place_consonant_among_subjc = False

                if add_this_consonant:
                    istructs[-1].subfix.append(cons)

                if not dchar_san.virama:
                    # no virama : the cluster of consonants ends here.
                    place_consonant_among_subjc = False

        # dependent vowel ('A' is inherent, nothing has to be written for it) :
        dependentvowel = dchar_san.dependentvowel
        if dependentvowel is not None and dependentvowel != "A":
            quality = dstring_object.options["san2bod quality"]

            #...........................................................
            # _dependent_vowel will be added as a dependent vowel :
            #...........................................................
            if quality in ("normal", "low") and dependentvowel == 'O':
                #====================
                # @@BOD2SAN-NORM-003 / @@BOD2SAN-LOW-003
                # (dependent vowel) ओ(ō) > औ(au)
                #====================
                _dependent_vowel = "AU"
            elif quality == "low" and dependentvowel in long_to_short_vowels:
                #====================
                # @@BOD2SAN-LOW-005
                # (dependent vowel) long vowels > short vowels
                #====================
                _dependent_vowel = long_to_short_vowels[dependentvowel]
            else:
                _dependent_vowel = dependentvowel

            unicode_symb = SAN__SYMB_DEPENDENT_VOWELS.get_default_symbol(_dependent_vowel)
            istructs[-1].vowel1 = DEPENDENT_VOWELS_INVERSED[unicode_symb]

        # anusvara/candrabindu :
        if dchar_san.anusvara_candrabindu is not None:
            unicode_symb = \
                SAN__SYMB_DIACRITICS.get_default_symbol(dchar_san.anusvara_candrabindu)
            istructs[-1].anusvara_candrabindu = DIACRITICS_INVERSED[unicode_symb]

    res = ListOfInternalStructures(anonymize_the_unknown_chars=anonymize)

    # we add a tsheg after a "real" syllable (id est, not a punctuation sign, ...)
    for istruct in istructs:
        res.append(istruct)
        if istruct.consonant is not None:
            res.append(InternalStructure(
                dstring_object=dstring_object,
                punctuation_or_other_symbol='MARK INTERSYLLABIC TSHEG'))

    return res
def init_from_str(self, str_src):
    """
        DStringSAN.init_from_str

        Function called by __init__() : fill self with the DCharacterSAN
        objects read from str_src. Nothing is returned; the characters are
        appended to self.

        str_src : str

        HOW IT WORKS :
        * (1) str_src is decomposed by unicodedata.normalize('NFD', ...)
        * (2) the symbols are replaced by the default symbols, table after table
              (replace_by_the_default_symbols())
        * (3) every match of DStringSAN.pattern gives one recognized character
              {basechar, dependentvowel, diacritics}; the parts of the string
              lying before, between and after the matches are stored as
              unknown characters, one by one.
        *     (3.1) virama
        *     (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
        *     (3.3) accent
        *     (3.4) nukta
        *     (3.5) anusvara_candrabindu
        *     (3.6) anudatta
        *     (3.7) the new character is added

        A DCharsError is raised if a kind of diacritic (virama, accent, nukta,
        anusvara/candrabindu, anudatta) is counted more than once for a
        character.
    """
    #.......................................................................
    # (1) decomposition
    #.......................................................................
    normalized_src = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) default symbols required
    #.......................................................................
    for symbols_table in (SYMB_CONSONANTS,
                          SYMB_INDEPENDENT_VOWELS,
                          SYMB_DEPENDENT_VOWELS,
                          SYMB_DIACRITICS,
                          SYMB_PUNCTUATION):
        normalized_src = symbols_table.replace_by_the_default_symbols(normalized_src)

    def add_unknown_chars(first, stop):
        """Append to self one unknown character per index in [first, stop)."""
        for position in range(first, stop):
            self.append(DCharacterSAN(dstring_object=self,
                                      unknown_char=True,
                                      base_char=normalized_src[position]))

    def check_defined_once(match, marks, symbols, err_msg):
        """Raise a DCharsError if several of the symbols are counted in marks."""
        if number_of_occurences(source_string=marks, symbols=symbols) > 1:
            raise DCharsError(context="DStringSAN.init_from_str",
                              message=err_msg.format(match.string,
                                                     match.start(),
                                                     match.end()),)

    def first_name_found(marks, symbols):
        """Return the name of the first of the symbols found in marks, or None."""
        for symbol in symbols:
            name = SYMB_DIACRITICS.defaultsymbol2name[symbol]
            if SYMB_DIACRITICS.are_these_symbols_in_a_string(name=name, string=marks):
                return name
        return None

    #.......................................................................
    # (3) initialisation from the recognized characters.
    #.......................................................................
    # index following the last well analysed substring (0 : nothing analysed yet)
    next_position = 0

    for element in re.finditer(DStringSAN.pattern, normalized_src):

        # unknown character(s) before the first match or between two matches :
        add_unknown_chars(next_position, element.start())
        next_position = max(element.start(), element.end() - 1) + 1

        groups = element.groupdict()
        base_char = groups['basechar']
        dependentvowel = groups['dependentvowel']
        diacritics = groups['diacritics']

        # base_char as "क" becomes "KA"
        name_as_punctuation = SYMB_PUNCTUATION.get_the_name_for_this_symbol(base_char)
        name_as_other_symbol = SYMB_OTHER_SYMBOLS.get_the_name_for_this_symbol(base_char)
        name_as_consonant = SYMB_CONSONANTS.get_the_name_for_this_symbol(base_char)
        name_as_ivowel = SYMB_INDEPENDENT_VOWELS.get_the_name_for_this_symbol(base_char)
        name_as_dvowel = SYMB_DEPENDENT_VOWELS.get_the_name_for_this_symbol(dependentvowel)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.1) virama
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        virama = False
        if diacritics is not None:
            check_defined_once(
                element, diacritics, SYMB_DIACRITICS__VIRAMA,
                "In '{0}' (start={1}, end={2}), 'virama' defined several times.")
            virama = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN VIRAMA',
                                                                   diacritics)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # is_an_independent_vowel keeps this value for the punctuation and the
        # "other" symbols : it is needed anyway by DCharacterSAN(...).
        is_an_independent_vowel = False
        punctuation = name_as_punctuation is not None

        if punctuation:
            # punctuation symbol :
            base_char = name_as_punctuation

        elif name_as_other_symbol is not None:
            # "other symbol" : not punctuation nor consonant nor independent vowel :
            base_char = name_as_other_symbol

        elif name_as_consonant is not None:
            # consonant :
            base_char = name_as_consonant

            if base_char != 'DEVANAGARI SIGN VISARGA' and \
               not virama and dependentvowel is None:
                # special case : for normal consonants (and visarga is a pseudo-consonant)
                # written without any vowel symbol, the dependent vowel
                # is 'A'. E.g. 'क' stands for 'ka', not for 'k'.
                dependentvowel = "A"
            else:
                dependentvowel = name_as_dvowel

        else:
            # independent vowel :
            is_an_independent_vowel = True
            dependentvowel = None
            base_char = name_as_ivowel

        accent = None
        nukta = False
        anusvara_candrabindu = None
        anudatta = False

        if diacritics is not None:
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) accent
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_defined_once(
                element, diacritics, SYMB_DIACRITICS__ACCENTS,
                "In '{0}' (start={1}, end={2}), 'accent' defined several times.")
            accent = first_name_found(diacritics, SYMB_DIACRITICS__ACCENTS)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.4) nukta
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_defined_once(
                element, diacritics, SYMB_DIACRITICS__NUKTA,
                "In '{0}' (start={1}, end={2}), 'nukta' defined several times.")
            nukta = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN NUKTA',
                                                                  diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.5) anusvara_candrabindu
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_defined_once(
                element, diacritics, SYMB_DIACRITICS__ANUSVARA_CANDRABINDU,
                "In '{0}' (start={1}, end={2}), "
                "'anusvara_candrabindu' defined several times.")
            anusvara_candrabindu = first_name_found(diacritics,
                                                    SYMB_DIACRITICS__ANUSVARA_CANDRABINDU)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.6) anudatta
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            check_defined_once(
                element, diacritics, SYMB_DIACRITICS__ANUDATTA,
                "In '{0}' (start={1}, end={2}), 'anudatta' defined several times.")
            anudatta = SYMB_DIACRITICS.are_these_symbols_in_a_string(
                'DEVANAGARI STRESS SIGN ANUDATTA',
                diacritics)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.7) we add the new character
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        self.append(DCharacterSAN(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  accent=accent,
                                  punctuation=punctuation,
                                  nukta=nukta,
                                  anusvara_candrabindu=anusvara_candrabindu,
                                  virama=virama,
                                  anudatta=anudatta,
                                  is_an_independent_vowel=is_an_independent_vowel,
                                  dependentvowel=dependentvowel))

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # final unknown characters : everything after the last match, or the
    # whole string if nothing has been recognized.
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    add_unknown_chars(next_position, len(normalized_src))