# "san2bod quality" values for which the @@BOD2SAN-NORM-* approximations apply
# (every "normal" rule is also applied when the quality is "low").
_SAN2BOD__APPROXIMATING_QUALITIES = ("normal", "low")

# @@BOD2SAN-LOW-005 / @@BOD2SAN-LOW-006 : long vowels > short vowels
_SAN2BOD_LOW__SHORTENED_VOWELS = {
    'AA' : 'A',
    'II' : 'I',
    'UU' : 'U',
}

# @@BOD2SAN-LOW-007 : retroflex consonant > non-retroflex consonant ;
# retroflex consonant + aspiration > non-retroflex consonant without aspiration
_SAN2BOD_LOW__DERETROFLEXED_CONSONANTS = {
    'TTA'  : "TA",
    'TTHA' : "TA",
    'DDA'  : "DA",
    'DDHA' : "DA",
    'NNA'  : "NA",
}

# @@BOD2SAN-LOW-008 : consonant + aspiration > consonant without aspiration
# The aspirated retroflex consonants are not listed here : they are caught by
# the retroflex table above, which is looked up first.
_SAN2BOD_LOW__DEASPIRATED_CONSONANTS = {
    'KHA' : "KA",
    'GHA' : "GA",
    'CHA' : "CA",
    'JHA' : "JA",
    'THA' : "TA",
    'DHA' : "DA",
    'PHA' : "PA",
    'BHA' : "BA",
}


def _san2bod_vowel( vowel, quality ):
    """
        Return the name of the vowel to be used instead of <vowel> for the
        given "san2bod quality". The same rules apply to independent vowels
        (@@BOD2SAN-NORM-004, -LOW-004, -LOW-006) and to dependent vowels
        (@@BOD2SAN-NORM-003, -LOW-003, -LOW-005).

        vowel   : (str) vowel name, like 'O' or 'AA'
        quality : (str) value of the "san2bod quality" option
    """
    if quality in _SAN2BOD__APPROXIMATING_QUALITIES and vowel == 'O':
        # ओ(ō) > औ(au)
        return "AU"

    if quality == "low" and vowel in _SAN2BOD_LOW__SHORTENED_VOWELS:
        # long vowels > short vowels
        return _SAN2BOD_LOW__SHORTENED_VOWELS[vowel]

    return vowel


def _san2bod_main_consonant( consonant, quality ):
    """
        Return the name of the consonant to be used instead of <consonant>
        when it is placed as a main consonant, for the given "san2bod quality".

        consonant : (str) consonant name, like 'VA' or 'DHA'
        quality   : (str) value of the "san2bod quality" option
    """
    if quality in _SAN2BOD__APPROXIMATING_QUALITIES and consonant == 'VA':
        # @@BOD2SAN-NORM-002 / @@BOD2SAN-LOW-002
        # the व(va) becomes ब(ba) if "san2bod quality" is "normal" or "low"
        return "BA"

    if quality == "low":
        # the retroflex table has the priority over the aspiration table,
        # so that e.g. 'TTHA' becomes "TA" and not "TTA".
        if consonant in _SAN2BOD_LOW__DERETROFLEXED_CONSONANTS:
            return _SAN2BOD_LOW__DERETROFLEXED_CONSONANTS[consonant]

        if consonant in _SAN2BOD_LOW__DEASPIRATED_CONSONANTS:
            return _SAN2BOD_LOW__DEASPIRATED_CONSONANTS[consonant]

    # general case :
    return consonant


def get_intstruct_from_trans_str( _src, dstring_object ):
    """
        function get_intstruct_from_trans_str()

        Read <_src> as a Sanskrit string and convert it, character by
        character, into Tibetan internal structures. The approximations
        applied depend on dstring_object.options["san2bod quality"] :
        the rules tagged @@BOD2SAN-NORM-* are applied if the quality is
        "normal" or "low", the rules tagged @@BOD2SAN-LOW-* only if the
        quality is "low"; with any other value the characters are kept as is.

        _src            : (str) transliterated string like "क".
        dstring_object  : object whose .options mapping gives the keys
                          "anonymize the unknown characters" and
                          "san2bod quality"; it is passed to every
                          InternalStructure created here.

        Return a ListOfInternalStructures object, where an intersyllabic
        tsheg follows every structure having a consonant.
    """
    # list of InternalStructure objects.
    istructs = ListOfInternalStructures(
        anonymize_the_unknown_chars = \
            dstring_object.options["anonymize the unknown characters"] == 'yes')

    # we read <_src> through a DSTRING_SAN object :
    dstring_san_type = new_dstring(language='संस्कृतम्', transliteration_method="iso15919")
    dstring_san = dstring_san_type(_src)

    # In Sanskrit, if a consonant is followed by a virama, it means that the following
    # consonants are part of a cluster of consonants.
    #
    # E.g. in कर्म (0915=ka, 0930=ra, 094D=virama, 092E=ma) we have something like kar+ma,
    # the -m- having no vowel.
    #
    place_consonant_among_subjc = False

    for dchar_san in dstring_san:

        if dchar_san.unknown_char:
            # unknown character : nothing else to read from <dchar_san>.
            # (the cluster flag is deliberately left untouched here)
            istructs.append( InternalStructure( dstring_object = dstring_object,
                                                unknown_character = True ))
            continue

        base_char = dchar_san.base_char

        # punctuation symbol :
        if base_char in SAN__SYMB_PUNCTUATION:
            unicode_symb = SAN__SYMB_PUNCTUATION.get_default_symbol(base_char)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = PUNCTUATION_INVERSED[unicode_symb] ))
            place_consonant_among_subjc = False

        # other symbol :
        elif base_char in SAN__SYMB_OTHER_SYMBOLS:
            unicode_symb = SAN__SYMB_OTHER_SYMBOLS.get_default_symbol(base_char)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = OTHER_SYMBOLS_INVERSED[unicode_symb] ))
            place_consonant_among_subjc = False

        # independent vowel :
        elif base_char in SAN__SYMB_INDEPENDENT_VOWELS:
            quality = dstring_object.options["san2bod quality"]
            independent_vowel = _san2bod_vowel(base_char, quality)

            unicode_symb = \
                SAN__SYMB_INDEPENDENT_VOWELS.get_default_symbol(independent_vowel)
            # an independent vowel is stored as the consonant "A" bearing the vowel :
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                consonant = "A",
                vowel1 = INDEPENDENT_VOWELS_INVERSED[unicode_symb] ))
            place_consonant_among_subjc = False

        # consonant :
        elif base_char in SAN__SYMB_CONSONANTS:
            quality = dstring_object.options["san2bod quality"]

            if base_char == 'DEVANAGARI SIGN VISARGA':
                # special case : the visarga symbol is placed among consonants in Sanskrit,
                # among diacritics in Tibetan.
                #
                # @@BOD2SAN-NORM-001 / @@BOD2SAN-LOW-001
                # the visarga is omitted if "san2bod quality" is "normal" or "low";
                # otherwise it is stored on the preceding structure.
                if quality not in _SAN2BOD__APPROXIMATING_QUALITIES:
                    istructs[-1].rnam_bcad = True

                place_consonant_among_subjc = False

            elif not place_consonant_among_subjc:
                # consonant to be placed as a main consonant
                # (and not among subjoined consonants) :
                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(
                    _san2bod_main_consonant(base_char, quality))
                istructs.append( InternalStructure(
                    dstring_object = dstring_object,
                    consonant = CONSONANTS_INVERSED[unicode_symb] ))

                if dchar_san.virama:
                    # no vowel : the next consonant(s) belong to the same cluster.
                    place_consonant_among_subjc = True

            else:
                # consonant to be placed among subjoined consonants
                # (and not as a main consonant) :
                current = istructs[-1]
                if current.subfix is None:
                    current.subfix = []

                # subjoined consonants are kept as is, whatever the quality.
                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(base_char)
                cons = CONSONANTS_INVERSED[unicode_symb]

                if quality == "low" and \
                   current.subfix == [] and \
                   current.consonant == cons:
                    # "low" quality : geminate consonant > 0
                    # The consonant is dropped and there is no more subjoined
                    # consonant : the following ones will be treated like
                    # main consonants.
                    place_consonant_among_subjc = False
                else:
                    current.subfix.append( cons )

                if not dchar_san.virama:
                    # end of the cluster.
                    place_consonant_among_subjc = False

        # dependent vowel (the inherent 'A' needs no symbol) :
        dependent_vowel = dchar_san.dependentvowel
        if dependent_vowel is not None and dependent_vowel != "A":
            quality = dstring_object.options["san2bod quality"]
            unicode_symb = SAN__SYMB_DEPENDENT_VOWELS.get_default_symbol(
                _san2bod_vowel(dependent_vowel, quality))
            istructs[-1].vowel1 = DEPENDENT_VOWELS_INVERSED[unicode_symb]

        # anusvara/candrabindu :
        if dchar_san.anusvara_candrabindu is not None:
            unicode_symb = \
                SAN__SYMB_DIACRITICS.get_default_symbol(dchar_san.anusvara_candrabindu)
            istructs[-1].anusvara_candrabindu = DIACRITICS_INVERSED[unicode_symb]

    res = ListOfInternalStructures(
        anonymize_the_unknown_chars = \
            dstring_object.options["anonymize the unknown characters"] == 'yes')

    # we add a tsheg after a "real" syllable (id est, not a punctuation sign, ...)
    for istruct in istructs:
        res.append(istruct)
        if istruct.consonant is not None:
            res.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = 'MARK INTERSYLLABIC TSHEG' ))

    return res
def get_sourcestr_representation(self):
    """
        DCharacterSAN.get_sourcestr_representation

        Build the string made of the symbol of the base character followed
        by the symbols of its dependent vowel and diacritics, then compose
        it (PRE_NORMALIZE_NFC, unicodedata NFC, POST_NORMALIZE_NFC).

        An unknown character is returned either as UNKNOWN_CHAR_SYMBOL (if
        the option "anonymize the unknown characters" is "yes") or as its
        base character.

        Return a string.
    """
    # .......................................................................
    # unknown char ? Nothing to analyse :
    # .......................................................................
    if self.unknown_char:
        anonymize = \
            self.dstring_object.options["anonymize the unknown characters"] == "yes"
        return UNKNOWN_CHAR_SYMBOL if anonymize else self.base_char

    # .......................................................................
    # ok, <self> can be analysed : we collect the symbols one by one.
    # .......................................................................
    symbols = []

    if self.base_char is None:
        # no base character : only the diacritics (if any) will be written.
        pass
    elif self.punctuation:
        # punctuation symbol :
        symbols.append(SYMB_PUNCTUATION.get_default_symbol(self.base_char))
    elif self.base_char in SYMB_OTHER_SYMBOLS:
        # "other symbol" : not punctuation nor consonant nor independent vowel.
        symbols.append(SYMB_OTHER_SYMBOLS.get_default_symbol(self.base_char))
    elif self.is_an_independent_vowel:
        # independent vowel :
        symbols.append(SYMB_INDEPENDENT_VOWELS.get_default_symbol(self.base_char))
    else:
        # consonant :
        symbols.append(SYMB_CONSONANTS.get_default_symbol(self.base_char))

    # dependent vowel, then the diacritics, always in this order :
    if self.dependentvowel is not None:
        symbols.append(SYMB_DEPENDENT_VOWELS.get_default_symbol(self.dependentvowel))

    if self.nukta:
        symbols.append(DEFAULTSYMB__NUKTA)

    if self.accent is not None:
        symbols.append(SYMB_DIACRITICS.get_default_symbol(self.accent))

    if self.virama:
        symbols.append(DEFAULTSYMB__VIRAMA)

    if self.anudatta:
        symbols.append(DEFAULTSYMB__ANUDATTA)

    if self.anusvara_candrabindu is not None:
        symbols.append(SYMB_DIACRITICS.get_default_symbol(self.anusvara_candrabindu))

    # the vowel 'a' has no symbol in devanagari : its fake symbol is removed.
    text = "".join(symbols).replace(FAKE_A__SYMBOL, "")

    # (1/3) composition with PRE_NORMALIZE_NFC :
    for before, after in PRE_NORMALIZE_NFC:
        text = text.replace(before, after)

    # (2/3) composition with unicodedata.normalize :
    text = unicodedata.normalize("NFC", text)

    # (3/3) composition with POST_NORMALIZE_NFC :
    for before, after in POST_NORMALIZE_NFC:
        text = text.replace(before, after)

    return text