class Payyans():
    """
    Convert Malayalam text between Unicode and the ASCII encoding used by
    a given font, driven by a per-font ``maps/<font>.map`` mapping file.

    The ASCII layout stores some vowel signs to the left of the consonant
    they belong to, whereas Unicode always stores them after it; most of
    the work in this class is moving those signs to the right place.
    """

    # Signs that are written entirely to the LEFT of the base letter in the
    # ASCII layout.
    _LEFT_SIGNS = (
        u"\u0d46",        # െ
        u"\u0d47",        # േ
        u"\u0d48",        # ൈ
        u"\u0d4d\u0d30",  # ്ര
    )
    # Signs whose ASCII form has one piece before and one piece after the
    # base letter.
    _SPLIT_SIGNS = (
        u"\u0d4a",  # ൊ
        u"\u0d4b",  # ോ
        u"\u0d4c",  # ൌ
    )
    # Signs that follow the consonant and must stay glued to it
    # ("്യ" as in "ക്യ", "്വ" as in "ക്വ").
    _POSTBASE_SIGNS = (
        u"\u0d4d\u0d2f",  # ്യ
        u"\u0d4d\u0d35",  # ്വ
    )
    _VOWEL_E = u"\u0d0e"  # എ
    _VOWEL_O = u"\u0d12"  # ഒ
    # (independent vowel, vowel sign) -> precomposed independent vowel.
    _COMPOSED_VOWELS = {
        (u"\u0d0e", u"\u0d46"): u"\u0d10",  # എ + െ -> ഐ
        (u"\u0d12", u"\u0d3e"): u"\u0d13",  # ഒ + ാ -> ഓ
        (u"\u0d12", u"\u0d57"): u"\u0d14",  # ഒ + ൗ -> ഔ
    }

    def __init__(self):
        self.input_filename = ""
        self.output_filename = ""
        self.mapping_filename = ""
        self.rulesDict = None
        self.pdf = 0
        self.normalizer = Normalizer()

    def Unicode2ASCII(self, unicode_text, font):
        """
        Convert Unicode text to the ASCII encoding of ``font``.

        :param unicode_text: the Unicode text to convert.
        :param font: name of the font; ``maps/<font>.map`` next to this
            module is loaded as the mapping table.
        :returns: the converted text. Characters with no mapping are copied
            through unchanged.
        """
        unicode_text = self.normalizer.normalize(unicode_text)
        self.direction = "u2a"
        self.mapping_filename = os.path.join(os.path.dirname(__file__),
                                             'maps', font + ".map")
        self.rulesDict = self.LoadRules()
        index = 0
        ascii_text = ""
        while index < len(unicode_text):
            # Shortcut for conjuncts: try the longest sequence first.
            for charNo in (3, 2, 1):
                letter = unicode_text[index:index + charNo]
                if letter in self.rulesDict:
                    ascii_letter = self.rulesDict[letter]
                    # Fit the vowel signs around the letter already emitted.
                    # The comparison is done on text, not on UTF-8 bytes: a
                    # bytes-vs-str comparison is always False on Python 3.
                    if letter in Payyans._SPLIT_SIGNS:
                        # One piece in front of the last letter, one behind.
                        ascii_text = ascii_text[:-1] + ascii_letter[0] + \
                            ascii_text[-1:] + ascii_letter[1]
                    elif letter in Payyans._LEFT_SIGNS:
                        # The whole sign goes in front of the last letter.
                        ascii_text = ascii_text[:-1] + ascii_letter + \
                            ascii_text[-1:]
                    else:
                        ascii_text = ascii_text + ascii_letter
                    index = index + charNo
                    break
                if charNo == 1:
                    # Nothing matched, not even the single character:
                    # copy it through as it is.
                    ascii_text = ascii_text + letter
                    index = index + 1
        return ascii_text

    def ASCII2Unicode(self, ascii_text, font):
        """
        Convert text in the ASCII encoding of ``font`` to Unicode.

        :param ascii_text: the font-encoded text to convert.
        :param font: name of the font; ``maps/<font>.map`` next to this
            module is loaded as the mapping table.
        :returns: the converted text. Characters with no mapping are copied
            through unchanged.
        """
        ascii_text = self.normalizer.normalize(ascii_text)
        self.direction = "a2u"
        self.mapping_filename = os.path.join(os.path.dirname(__file__),
                                             'maps', font + ".map")
        self.rulesDict = self.LoadRules()
        index = 0
        prebase_letter = ""
        postbase_letter = ""  # "്യ", "്വ"
        unicode_text = ""
        while index < len(ascii_text):
            for charNo in (2, 1):
                letter = ascii_text[index:index + charNo]
                if letter in self.rulesDict:
                    unicode_letter = self.rulesDict[letter]
                    if self.isPrebase(unicode_letter):
                        # A left-side vowel sign: hold it until the base
                        # letter it belongs to has been seen.
                        prebase_letter = unicode_letter
                    else:
                        # Not a vowel sign. Check whether a post-base sign
                        # follows the consonant; if so consume it too.
                        post_index = index + charNo
                        if post_index < len(ascii_text):
                            next_letter = ascii_text[post_index]
                            if next_letter in self.rulesDict:
                                next_ucode_letter = self.rulesDict[next_letter]
                                if self.isPostbase(next_ucode_letter):
                                    postbase_letter = next_ucode_letter
                                    index = index + 1
                        if unicode_letter in (Payyans._VOWEL_E,
                                              Payyans._VOWEL_O):
                            # Independent vowel plus the pending sign may
                            # form a precomposed vowel (e.g. "െ" + "എ" is
                            # how the ASCII layout spells "ഐ"). The vowel
                            # is the first argument, the sign the second.
                            unicode_text = unicode_text + postbase_letter + \
                                self.getVowelSign(unicode_letter,
                                                  prebase_letter)
                        else:
                            unicode_text = unicode_text + unicode_letter + \
                                postbase_letter + prebase_letter
                        prebase_letter = ""
                        postbase_letter = ""
                    index = index + charNo
                    break
                if charNo == 1:
                    unicode_text = unicode_text + letter
                    index = index + 1
        return unicode_text  # converted text handed back to the caller

    def getVowelSign(self, vowel_letter, vowel_sign_letter):
        """
        Combine an independent vowel with a vowel sign.

        :param vowel_letter: the independent vowel (e.g. "എ", "ഒ").
        :param vowel_sign_letter: the vowel sign attached to it.
        :returns: the precomposed vowel for the known pairs
            ("എ"+"െ" -> "ഐ", "ഒ"+"ാ" -> "ഓ", "ഒ"+"ൗ" -> "ഔ"), otherwise the
            two arguments concatenated.
        """
        composed = Payyans._COMPOSED_VOWELS.get(
            (vowel_letter, vowel_sign_letter))
        if composed is not None:
            return composed
        return (vowel_letter + vowel_sign_letter)

    def isPrebase(self, letter):
        """
        Tell whether ``letter`` is a sign that the ASCII layout writes to
        the left of its base letter.

        Such vowel signs sit on the left in the ASCII encoding but come
        after the letter in Unicode, so they have to be recognised and
        moved during conversion.

        :returns: True for "േ", "ൈ", "ൊ", "ോ", "ൌ", "്ര" and "െ",
            False otherwise.
        """
        return (letter in Payyans._LEFT_SIGNS or
                letter in Payyans._SPLIT_SIGNS)

    def isPostbase(self, letter):
        """
        Tell whether ``letter`` is a post-base sign.

        "്യ" in "ക്യ" and "്വ" in "ക്വ" are post-base. "ത്യേ" is written
        in ASCII as "േ" + "ത" + "്യ", so when a post-base follows the
        consonant the pre-base sign may only be appended after
        consonant + post-base.

        :returns: True for "്യ" and "്വ", False otherwise.
        """
        return letter in Payyans._POSTBASE_SIGNS

    def LoadRules(self):
        """
        Read ``self.mapping_filename`` and build the conversion table.

        Each non-blank line that does not start with ``#`` must have the
        form ``lhs=rhs``. With ``self.direction == 'a2u'`` the table maps
        lhs -> rhs, otherwise rhs -> lhs.

        :returns: the mapping dict, or the integer ``2`` after printing a
            message when a line is not of the form ``lhs=rhs`` (legacy
            error code, kept for backward compatibility).
        """
        rules_dict = dict()
        # The context manager closes the mapping file on every exit path,
        # including the early return on a syntax error. codecs.open already
        # yields decoded text, so no further conversion is needed.
        with codecs.open(self.mapping_filename, encoding='utf-8',
                         errors='ignore') as rules_file:
            # The line number is only needed to report mapping-file errors.
            for line_number, text in enumerate(rules_file, start=1):
                # Comment lines are ignored.
                if text[0] == '#':
                    continue
                line = text.strip()
                # Blank lines are ignored too.
                if line == "":
                    continue
                parts = line.split("=")
                if len(parts) != 2:
                    # Something is wrong with this line: say so and stop.
                    print(
                        "Error: Syntax Error in the Ascii to Unicode Map "
                        "in line number ", line_number)
                    print("Line: " + text)
                    return 2  # Error - Syntax error in Mapping file
                lhs = parts[0].strip()
                rhs = parts[1].strip()
                if self.direction == 'a2u':
                    rules_dict[lhs] = rhs
                else:
                    rules_dict[rhs] = lhs
        return rules_dict

    def get_module_name(self):
        """Return the display name of this module."""
        return "Payyans Unicode-ASCII Converter"

    def get_info(self):
        """Return a one-line description of this module."""
        return "ASCII data - Unicode Convertor based on font maps"
class Transliterator:
    """
    Transliteration class, instantiate this to get access to the
    transliteration methods
    """

    def __init__(self):
        self.cmu = CMUDict()
        self.normalizer = Normalizer()

    def transliterate_en_ml(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Malayalam with the help of the CMU
        pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "ml_IN")

    def transliterate_en_kn(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Kannada with the help of the CMU
        pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "kn_IN")

    def transliterate_en_hi(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Hindi with the help of the CMU
        pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "hi_IN")

    def transliterate_en_xx(self, word, target_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.

        Transliterate English to any Indian Language. Kannada and Hindi
        are produced directly; every other language goes through
        Malayalam and is then chained through
        :meth:`transliterate_indic_indic`.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "hi_IN":
            return self.transliterate_en_hi(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str
        # Chain it through indic-indic transliteration. First remove the
        # Malayalam specific joiners, written as explicit escapes because
        # the characters are invisible in an editor. Stripping them before
        # the virama test lets a trailing virama + joiner be recognised.
        tx_str = tx_str.replace(u'\u200D', u'')  # zwj
        tx_str = tx_str.replace(u'\u200C', u'')  # zwnj
        if tx_str[-1:] == u'\u0D4D' and \
                target_lang in ("hi_IN", "gu_IN", "bn_IN"):
            tx_str = tx_str[:-1]  # remove the last virama
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :returns: the transliterated word.

        Transliterate Indian Language to English.
        """
        if src_lang in ("en_IN", "en_US"):
            return word
        # TODO: the function is generic now so there is no need to test the
        # language, but since indic_en contains tables only for kn_IN and
        # ml_IN we need this check. Add all Indic languages to indic_en and
        # replace this block with a single call to the indic_en function.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, src_lang)
        if src_lang != "ml_IN":
            word = self.transliterate_indic_indic(word, src_lang, "ml_IN")
        return self.transliterate_indic_en(word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate the given word in src_language to ISO 15919
        notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word_length = len(word)
        drops_final_a = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, character in enumerate(word, start=1):
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all Indian languages from their
            # base; for offsets 61..76 the 'a' emitted for the previous
            # letter is dropped.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-1]  # remove the last 'a'
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
            # Delete the inherent 'a' at the end of the word for Hindi,
            # Gujarati and Bengali.
            if (tx_str[-1:] == 'a' and drops_final_a and
                    word_length == index and word_length > 1):
                tx_str = tx_str[:-1]
        # Text has no .decode() on Python 3; only decode real byte strings.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to IPA - International
        Phonetic Alphabet notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        schwa = 'ə'
        tx_str = ""
        word_length = len(word)
        drops_final_schwa = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, character in enumerate(word, start=1):
            if ord(character) < 255:
                # ASCII characters + English are copied through.
                tx_str += character
                continue
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all Indian languages from their
            # base; for offsets 61..76 the schwa emitted for the previous
            # letter is dropped.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-len(schwa)]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap_transphon["IPA"][offset]
            # Delete the inherent schwa at the end of the word for Hindi,
            # Gujarati and Bengali.
            if (tx_str[-1:] == schwa and drops_final_schwa and
                    word_length == index and word_length > 1):
                tx_str = tx_str[:-len(schwa)]
        # Text has no .decode() on Python 3; only decode real byte strings.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def _malayalam_fixes(self, text):
        """
        Apply Malayalam specific clean-ups to transliterated text.

        "മ്" directly before a space, comma, full stop or closing
        parenthesis becomes an anuswara, "ഩ" becomes "ന" and a danda
        becomes a full stop. Best effort: input that cannot be processed
        is returned unchanged.
        """
        replacements = (
            (u"\u0D2E\u0D4D ", u"\u0D02 "),  # മ് + space -> ം + space
            (u"\u0D2E\u0D4D,", u"\u0D02,"),
            (u"\u0D2E\u0D4D.", u"\u0D02."),
            (u"\u0D2E\u0D4D)", u"\u0D02)"),
            (u"\u0D29", u"\u0D28"),          # ഩ -> ന
            # Danda by full stop. Spelled as escapes so the pattern can
            # never degrade to an empty string, which would insert a full
            # stop between every character.
            # NOTE(review): covers the danda itself (U+0964) and its image
            # in the Malayalam block (U+0D64) - confirm which one the
            # original literal held.
            (u"\u0964", u"."),
            (u"\u0D64", u"."),
        )
        try:
            for old, new in replacements:
                text = text.replace(old, new)
        except (AttributeError, TypeError, UnicodeError):
            # Best effort only: leave unprocessable input untouched.
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate from an Indian language word to another Indian
        language word by shifting every character by the distance between
        the two script blocks.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word = self.normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # replace all samvruthokaram by u vowels
            # NOTE(review): the replacement is empty as found, which drops
            # the sequence instead of leaving a "u" sign - confirm intent.
            word = word.replace(u"\u0D41\u0D4D", u"")
        word_length = len(word)
        shift = None
        for index, character in enumerate(word, start=1):
            code_point = ord(character)
            # Characters outside the Indic script blocks are copied
            # through untouched. The previous test (<= 2304 and >= 3071)
            # could never be true, so such characters were shifted into
            # unrelated code points or dropped.
            # NOTE(review): the upper bound is the end of the Malayalam
            # block (U+0D7F) so that every supported script stays inside
            # the range - confirm against lang_bases.
            if (character in string.punctuation or
                    code_point <= 0x0900 or code_point > 0x0D7F):
                tx_str = tx_str + character
                continue
            if shift is None:
                # Looked up once, on the first Indic character.
                shift = self.getOffset(src_lang, target_lang)
            offset = code_point + shift
            if offset > 0:
                tx_str = tx_str + chr(offset)
            # schwa deletion
            baseoffset = offset - lang_bases[target_lang]
            # 76 : virama
            # TODO Add more languages having schwa deletion characteristic
            if (index == word_length and baseoffset == 76 and
                    target_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN")):
                tx_str = tx_str[:-1]  # drop the trailing virama
        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"\u0D29", u"\u0D28")  # ഩ -> ന
        if target_lang == "ta_IN":
            # Fold the code points produced by the shift onto the Tamil
            # letters listed on the right.
            tamil_folds = (
                (u'\u0B96', u"க"), (u'\u0B97', u"க"), (u'\u0B98', u"க"),
                (u'\u0B9B', u"ச"), (u'\u0B9D', u"ச"),
                (u'\u0BA0', u"ட"), (u'\u0BA1', u"ட"), (u'\u0BA2', u"ட"),
                (u'\u0BA5', u"த"), (u'\u0BA6', u"த"), (u'\u0BA7', u"த"),
                (u'\u0BAB', u"ப"), (u'\u0BAC', u"ப"), (u'\u0BAD', u"ப"),
                (u'\u0BC3', u"ிரு"),
                (u'ஂ', u'ம்'),
            )
            for old, new in tamil_folds:
                tx_str = tx_str.replace(old, new)
        # If target is malayalam, we need to add the virama. Slicing
        # instead of indexing keeps an empty result from raising
        # IndexError.
        if (target_lang == "ml_IN" and
                src_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN") and
                tx_str[-1:].isalpha()):
            tx_str = tx_str + u"\u0D4D"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate an Indic word (or sentence) to English letters.

        :param word: Word to be transliterated (sentence).
        :type word: str.
        :param src_lang: Language from which we need to transliterate.
        :type src_lang: str.
        :returns: the transliterated text.
        """
        # Get all the language related tables.
        dictionary = indic_en.get_dictionary_for(src_lang)
        vowels = indic_en.get_vowels_for(src_lang)
        vowel_signs = indic_en.get_vowel_signs_for(src_lang)
        virama = indic_en.get_virama_for(src_lang)
        anuswara = indic_en.get_anuswara_for(src_lang)

        word_length = len(word)
        index = 0
        tx_string = ""
        while index < word_length:
            current = word[index]
            # Punctuation is copied as it is; handling it here avoids an
            # extra 'a' at the beginning of the word that follows it.
            if current in string.punctuation:
                tx_string += current
                index += 1
                continue
            # Virama = conjunct marker, produces no output.
            if current == virama:
                index += 1
                continue
            # Get the English equivalent of the character; characters
            # missing from the dictionary are appended unchanged.
            try:
                tx_string += dictionary[current]
            except KeyError:
                tx_string += current

            has_next = index + 1 < word_length
            # Inherent 'a' between this letter and a following letter that
            # is not a vowel sign.
            if has_next and not word[index + 1] in vowel_signs \
                    and word[index + 1] in dictionary \
                    and not current in vowels \
                    and not current in vowel_signs:
                tx_string += 'a'
            # Inherent 'a' on the last letter of the word.
            if index + 1 == word_length and not current in vowel_signs \
                    and current in dictionary:
                tx_string += 'a'
            # handle am sign
            if has_next and word[index + 1] == anuswara \
                    and not current in vowel_signs:
                tx_string += 'a'
            index += 1
        return tx_string

    def transliterate(self, text, target_lang_code):
        """
        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be
            transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.

        The transliteration function which can transliterate text to the
        supported target languages. The text is processed word by word;
        a word whose language cannot be detected is copied unchanged.
        """
        tx_str = ""
        lines = text.split("\n")
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    tx_str = tx_str + word
                    continue
                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # FIXME: detection failed, keep the word as it is.
                    tx_str = tx_str + " " + word
                    continue
                if target_lang_code == "ISO15919":
                    tx_str = (tx_str + self.transliterate_iso15919(
                        word, src_lang_code) + " ")
                    continue
                if target_lang_code == "IPA":
                    tx_str = (tx_str + self.transliterate_ipa(
                        word, src_lang_code) + " ")
                    continue
                if src_lang_code == "en_US":
                    tx_str = (tx_str + self.transliterate_en_xx(
                        word, target_lang_code) + " ")
                    continue
                if target_lang_code in ("en_US", "en_IN"):
                    tx_str = (tx_str + self.transliterate_xx_en(
                        word, src_lang_code) + " ")
                    continue
                tx_str += self.transliterate_indic_indic(
                    word, src_lang_code, target_lang_code)
                if len(line) > 1:
                    tx_str += " "
            if len(lines) > 1:
                tx_str += "\n"
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """
        Return the distance between the script blocks of two languages.

        :param src: source language code.
        :param target: target language code.
        :returns: ``lang_bases[target] - lang_bases[src]``, or 0 when
            either language is unknown.
        """
        try:
            return lang_bases[target] - lang_bases[src]
        except (KeyError, TypeError):
            return 0

    def get_module_name(self):
        """
        returns module name
        """
        return "Transliterator"

    def get_info(self):
        """
        Returns module info
        """
        return "Transliterate the text between any Indian Language"
class Transliterator:
    """
    Transliteration class, instantiate this to get access to the
    transliteration methods
    """

    def __init__(self):
        self.cmu = CMUDict()
        self.normalizer = Normalizer()

    def transliterate_en_ml(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Malayalam with the help of the CMU
        pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "ml_IN")

    def transliterate_en_kn(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Kannada with the help of the CMU
        pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "kn_IN")

    def transliterate_en_hi(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Hindi with the help of the CMU
        pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "hi_IN")

    def transliterate_en_xx(self, word, target_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.

        Transliterate English to any Indian Language. Kannada and Hindi
        are produced directly; every other language goes through
        Malayalam and is then chained through
        :meth:`transliterate_indic_indic`.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "hi_IN":
            return self.transliterate_en_hi(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str
        # Chain it through indic-indic transliteration. First remove the
        # Malayalam specific joiners, written as explicit escapes because
        # the characters are invisible in an editor. Stripping them before
        # the virama test lets a trailing virama + joiner be recognised.
        tx_str = tx_str.replace(u'\u200D', u'')  # zwj
        tx_str = tx_str.replace(u'\u200C', u'')  # zwnj
        if tx_str[-1:] == u'\u0D4D' and \
                target_lang in ("hi_IN", "gu_IN", "bn_IN"):
            tx_str = tx_str[:-1]  # remove the last virama
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :returns: the transliterated word.

        Transliterate Indian Language to English.
        """
        if src_lang in ("en_IN", "en_US"):
            return word
        # TODO: the function is generic now so there is no need to test the
        # language, but since indic_en contains tables only for kn_IN and
        # ml_IN we need this check. Add all Indic languages to indic_en and
        # replace this block with a single call to the indic_en function.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, src_lang)
        if src_lang != "ml_IN":
            word = self.transliterate_indic_indic(word, src_lang, "ml_IN")
        return self.transliterate_indic_en(word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate the given word in src_language to ISO 15919
        notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word_length = len(word)
        drops_final_a = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, character in enumerate(word, start=1):
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all Indian languages from their
            # base; for offsets 61..76 the 'a' emitted for the previous
            # letter is dropped.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-1]  # remove the last 'a'
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
            # Delete the inherent 'a' at the end of the word for Hindi,
            # Gujarati and Bengali.
            if (tx_str[-1:] == 'a' and drops_final_a and
                    word_length == index and word_length > 1):
                tx_str = tx_str[:-1]
        # Text has no .decode() on Python 3; only decode real byte strings.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to IPA - International
        Phonetic Alphabet notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        schwa = 'ə'
        tx_str = ""
        word_length = len(word)
        drops_final_schwa = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, character in enumerate(word, start=1):
            if ord(character) < 255:
                # ASCII characters + English are copied through.
                tx_str += character
                continue
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all Indian languages from their
            # base; for offsets 61..76 the schwa emitted for the previous
            # letter is dropped.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-len(schwa)]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap_transphon["IPA"][offset]
            # Delete the inherent schwa at the end of the word for Hindi,
            # Gujarati and Bengali.
            if (tx_str[-1:] == schwa and drops_final_schwa and
                    word_length == index and word_length > 1):
                tx_str = tx_str[:-len(schwa)]
        # Text has no .decode() on Python 3; only decode real byte strings.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def _malayalam_fixes(self, text):
        """
        Apply Malayalam specific clean-ups to transliterated text.

        "മ്" directly before a space, comma, full stop or closing
        parenthesis becomes an anuswara, "ഩ" becomes "ന" and a danda
        becomes a full stop. Best effort: input that cannot be processed
        is returned unchanged.
        """
        replacements = (
            (u"\u0D2E\u0D4D ", u"\u0D02 "),  # മ് + space -> ം + space
            (u"\u0D2E\u0D4D,", u"\u0D02,"),
            (u"\u0D2E\u0D4D.", u"\u0D02."),
            (u"\u0D2E\u0D4D)", u"\u0D02)"),
            (u"\u0D29", u"\u0D28"),          # ഩ -> ന
            # Danda by full stop. Spelled as escapes so the pattern can
            # never degrade to an empty string, which would insert a full
            # stop between every character.
            # NOTE(review): covers the danda itself (U+0964) and its image
            # in the Malayalam block (U+0D64) - confirm which one the
            # original literal held.
            (u"\u0964", u"."),
            (u"\u0D64", u"."),
        )
        try:
            for old, new in replacements:
                text = text.replace(old, new)
        except (AttributeError, TypeError, UnicodeError):
            # Best effort only: leave unprocessable input untouched.
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate from an Indian language word to another Indian
        language word by shifting every character by the distance between
        the two script blocks.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word = self.normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # replace all samvruthokaram by u vowels
            # NOTE(review): the replacement is empty as found, which drops
            # the sequence instead of leaving a "u" sign - confirm intent.
            word = word.replace(u"\u0D41\u0D4D", u"")
        word_length = len(word)
        shift = None
        for index, character in enumerate(word, start=1):
            code_point = ord(character)
            # Characters outside the Indic script blocks are copied
            # through untouched. The previous test (<= 2304 and >= 3071)
            # could never be true, so such characters were shifted into
            # unrelated code points or dropped.
            # NOTE(review): the upper bound is the end of the Malayalam
            # block (U+0D7F) so that every supported script stays inside
            # the range - confirm against lang_bases.
            if (character in string.punctuation or
                    code_point <= 0x0900 or code_point > 0x0D7F):
                tx_str = tx_str + character
                continue
            if shift is None:
                # Looked up once, on the first Indic character.
                shift = self.getOffset(src_lang, target_lang)
            offset = code_point + shift
            if offset > 0:
                tx_str = tx_str + chr(offset)
            # schwa deletion
            baseoffset = offset - lang_bases[target_lang]
            # 76 : virama
            # TODO Add more languages having schwa deletion characteristic
            if (index == word_length and baseoffset == 76 and
                    target_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN")):
                tx_str = tx_str[:-1]  # drop the trailing virama
        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"\u0D29", u"\u0D28")  # ഩ -> ന
        if target_lang == "ta_IN":
            # Fold the code points produced by the shift onto the Tamil
            # letters listed on the right.
            tamil_folds = (
                (u'\u0B96', u"க"), (u'\u0B97', u"க"), (u'\u0B98', u"க"),
                (u'\u0B9B', u"ச"), (u'\u0B9D', u"ச"),
                (u'\u0BA0', u"ட"), (u'\u0BA1', u"ட"), (u'\u0BA2', u"ட"),
                (u'\u0BA5', u"த"), (u'\u0BA6', u"த"), (u'\u0BA7', u"த"),
                (u'\u0BAB', u"ப"), (u'\u0BAC', u"ப"), (u'\u0BAD', u"ப"),
                (u'\u0BC3', u"ிரு"),
                (u'ஂ', u'ம்'),
            )
            for old, new in tamil_folds:
                tx_str = tx_str.replace(old, new)
        # If target is malayalam, we need to add the virama. Slicing
        # instead of indexing keeps an empty result from raising
        # IndexError.
        if (target_lang == "ml_IN" and
                src_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN") and
                tx_str[-1:].isalpha()):
            tx_str = tx_str + u"\u0D4D"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate an Indic word (or sentence) to English letters.

        :param word: Word to be transliterated (sentence).
        :type word: str.
        :param src_lang: Language from which we need to transliterate.
        :type src_lang: str.
        :returns: the transliterated text.
        """
        # Get all the language related tables.
        dictionary = indic_en.get_dictionary_for(src_lang)
        vowels = indic_en.get_vowels_for(src_lang)
        vowel_signs = indic_en.get_vowel_signs_for(src_lang)
        virama = indic_en.get_virama_for(src_lang)
        anuswara = indic_en.get_anuswara_for(src_lang)

        word_length = len(word)
        index = 0
        tx_string = ""
        while index < word_length:
            current = word[index]
            # Punctuation is copied as it is; handling it here avoids an
            # extra 'a' at the beginning of the word that follows it.
            if current in string.punctuation:
                tx_string += current
                index += 1
                continue
            # Virama = conjunct marker, produces no output.
            if current == virama:
                index += 1
                continue
            # Get the English equivalent of the character; characters
            # missing from the dictionary are appended unchanged.
            try:
                tx_string += dictionary[current]
            except KeyError:
                tx_string += current

            has_next = index + 1 < word_length
            # Inherent 'a' between this letter and a following letter that
            # is not a vowel sign.
            if has_next and not word[index + 1] in vowel_signs \
                    and word[index + 1] in dictionary \
                    and not current in vowels \
                    and not current in vowel_signs:
                tx_string += 'a'
            # Inherent 'a' on the last letter of the word.
            if index + 1 == word_length and not current in vowel_signs \
                    and current in dictionary:
                tx_string += 'a'
            # handle am sign
            if has_next and word[index + 1] == anuswara \
                    and not current in vowel_signs:
                tx_string += 'a'
            index += 1
        return tx_string

    def transliterate(self, text, target_lang_code):
        """
        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be
            transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.

        The transliteration function which can transliterate text to the
        supported target languages. The text is processed word by word;
        a word whose language cannot be detected is copied unchanged.
        """
        tx_str = ""
        lines = text.split("\n")
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    tx_str = tx_str + word
                    continue
                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # FIXME: detection failed, keep the word as it is.
                    tx_str = tx_str + " " + word
                    continue
                if target_lang_code == "ISO15919":
                    tx_str = (tx_str + self.transliterate_iso15919(
                        word, src_lang_code) + " ")
                    continue
                if target_lang_code == "IPA":
                    tx_str = (tx_str + self.transliterate_ipa(
                        word, src_lang_code) + " ")
                    continue
                if src_lang_code == "en_US":
                    tx_str = (tx_str + self.transliterate_en_xx(
                        word, target_lang_code) + " ")
                    continue
                if target_lang_code in ("en_US", "en_IN"):
                    tx_str = (tx_str + self.transliterate_xx_en(
                        word, src_lang_code) + " ")
                    continue
                tx_str += self.transliterate_indic_indic(
                    word, src_lang_code, target_lang_code)
                if len(line) > 1:
                    tx_str += " "
            if len(lines) > 1:
                tx_str += "\n"
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """
        Return the distance between the script blocks of two languages.

        :param src: source language code.
        :param target: target language code.
        :returns: ``lang_bases[target] - lang_bases[src]``, or 0 when
            either language is unknown.
        """
        try:
            return lang_bases[target] - lang_bases[src]
        except (KeyError, TypeError):
            return 0

    def get_module_name(self):
        """
        returns module name
        """
        return "Transliterator"

    def get_info(self):
        """
        Returns module info
        """
        return "Transliterate the text between any Indian Language"