Python Normalizer.normalize Examples

Programming Language: Python

Namespace/Package Name: libindic.normalizer

Class/Type: Normalizer

Method/Function: normalize

Examples at hotexamples.com: 3

Python Normalizer.normalize - 3 examples found. These are the top rated real world Python examples of libindic.normalizer.Normalizer.normalize extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Normalizer(3)

normalize(2)

Frequently Used Methods

Normalizer (3)

normalize (2)

Example #1

Show file

class Payyans():
    """Convert Malayalam text between Unicode and legacy ASCII font encodings.

    The conversion tables are read from ``maps/<font>.map`` located next to
    this module. Every data line of a map file has the form ``lhs=rhs``;
    ``LoadRules`` keys the table by ``lhs`` for ASCII -> Unicode and by
    ``rhs`` for Unicode -> ASCII.
    """

    # Signs that follow the base letter in Unicode but are written to the
    # left of it in the ASCII fonts.
    _LEFT_SIGNS = (u"െ", u"േ", u"ൈ", u"്ര")
    # Two-part signs: the first ASCII glyph goes to the left of the base
    # letter and the second one to its right.
    _SPLIT_SIGNS = (u"ൊ", u"ോ", u"ൌ")
    # Signs that must stay attached to the consonant before any pre-base
    # sign is appended.
    _POSTBASE_SIGNS = (u"്യ", u"്വ")

    def __init__(self):
        self.input_filename = ""
        self.output_filename = ""
        self.mapping_filename = ""
        self.rulesDict = None
        self.pdf = 0
        self.normalizer = Normalizer()

    def Unicode2ASCII(self, unicode_text, font):
        """Convert Unicode text to the ASCII encoding of ``font``.

        :param unicode_text: text to convert; it is normalized first.
        :param font: map name; ``maps/<font>.map`` is loaded.
        :returns: the converted text. Characters without a mapping are
            copied through unchanged.
        """
        unicode_text = self.normalizer.normalize(unicode_text)
        self.direction = "u2a"
        self.mapping_filename = os.path.join(os.path.dirname(__file__), 'maps',
                                             font + ".map")
        self.rulesDict = self.LoadRules()
        rules = self.rulesDict
        text_length = len(unicode_text)
        ascii_text = ""
        index = 0
        while index < text_length:
            # Try the longest candidate first so that conjuncts win over
            # their individual letters.
            for char_count in (3, 2, 1):
                letter = unicode_text[index:index + char_count]
                if letter in rules:
                    ascii_letter = rules[letter]
                    # Compare text with text: encoding the letter to bytes
                    # first makes every comparison fail on Python 3.
                    if letter in self._SPLIT_SIGNS:
                        # One glyph before and one after the previous
                        # letter; slices avoid IndexError on short maps.
                        ascii_text = (ascii_text[:-1] + ascii_letter[:1] +
                                      ascii_text[-1:] + ascii_letter[1:2])
                    elif letter in self._LEFT_SIGNS:
                        # The whole glyph goes before the previous letter.
                        ascii_text = (ascii_text[:-1] + ascii_letter +
                                      ascii_text[-1:])
                    else:
                        ascii_text = ascii_text + ascii_letter
                    index = index + char_count
                    break
                if char_count == 1:
                    # No mapping at any length: keep the character as is.
                    ascii_text = ascii_text + letter
                    index = index + 1
                    break
        return ascii_text

    def ASCII2Unicode(self, ascii_text, font):
        """Convert text in the ASCII encoding of ``font`` to Unicode.

        :param ascii_text: text to convert; it is normalized first.
        :param font: map name; ``maps/<font>.map`` is loaded.
        :returns: the converted text. Characters without a mapping are
            copied through unchanged.
        """
        ascii_text = self.normalizer.normalize(ascii_text)
        self.direction = "a2u"
        self.mapping_filename = os.path.join(os.path.dirname(__file__), 'maps',
                                             font + ".map")
        self.rulesDict = self.LoadRules()
        rules = self.rulesDict
        text_length = len(ascii_text)
        unicode_text = ""
        prebase_letter = ""
        postbase_letter = ""
        index = 0
        while index < text_length:
            for char_count in (2, 1):
                letter = ascii_text[index:index + char_count]
                if letter in rules:
                    unicode_letter = rules[letter]
                    if self.isPrebase(unicode_letter):
                        # Hold the sign back until its base letter arrives.
                        prebase_letter = unicode_letter
                    else:
                        # A post-base sign right after the letter has to be
                        # emitted before the held pre-base sign.
                        post_index = index + char_count
                        if post_index < text_length:
                            following = rules.get(ascii_text[post_index])
                            if (following is not None
                                    and self.isPostbase(following)):
                                postbase_letter = following
                                index = index + 1
                        if unicode_letter in (u"എ", u"ഒ"):
                            # getVowelSign expects (vowel, sign); the vowel
                            # is the letter just read, the sign is the held
                            # pre-base sign.
                            unicode_text = unicode_text + postbase_letter + \
                                self.getVowelSign(unicode_letter,
                                                  prebase_letter)
                        else:
                            unicode_text = unicode_text + unicode_letter + \
                                postbase_letter + prebase_letter
                        prebase_letter = ""
                        postbase_letter = ""
                    index = index + char_count
                    break
                if char_count == 1:
                    # No mapping at any length: keep the character as is.
                    unicode_text = unicode_text + letter
                    index = index + 1
                    break
        return unicode_text

    def getVowelSign(self, vowel_letter, vowel_sign_letter):
        """Combine an independent vowel with a sign into a single vowel.

        :param vowel_letter: the independent vowel.
        :param vowel_sign_letter: the sign attached to it.
        :returns: the combined vowel for the known pairs, otherwise the two
            arguments concatenated.
        """
        if vowel_letter == u"എ":
            if vowel_sign_letter == u"െ":
                return u"ഐ"
        if vowel_letter == u"ഒ":
            if vowel_sign_letter == u"ാ":
                return u"ഓ"
            if vowel_sign_letter == u"ൗ":
                return u"ഔ"
        return (vowel_letter + vowel_sign_letter)

    def isPrebase(self, letter):
        """Return True if ``letter`` is a sign the ASCII fonts write on the left.

        Such signs come before the base letter in ASCII text but after it in
        Unicode, so the converters have to reorder them.
        """
        return letter in self._LEFT_SIGNS or letter in self._SPLIT_SIGNS

    def isPostbase(self, letter):
        """Return True if ``letter`` is a post-base sign.

        When a consonant is followed by a post-base sign, a pending pre-base
        sign may only be appended after consonant + post-base.
        """
        return letter in self._POSTBASE_SIGNS

    def LoadRules(self):
        """Read ``self.mapping_filename`` and build the conversion table.

        Lines whose first character is ``#`` and blank lines are skipped.
        Every other line must contain exactly one ``=``.

        :returns: a dict keyed by the left-hand side when ``self.direction``
            is ``'a2u'`` and by the right-hand side otherwise; the integer
            ``2`` if a line is malformed (an error is printed first).
        """
        rules_dict = dict()
        # The context manager closes the file on every exit path, including
        # the early error return.
        with codecs.open(self.mapping_filename,
                         encoding='utf-8',
                         errors='ignore') as rules_file:
            # The line number is only needed for the error report.
            for line_number, text in enumerate(rules_file, 1):
                if text[0] == '#':
                    continue
                line = text.strip()
                if line == "":
                    continue
                parts = line.split("=")
                if len(parts) != 2:
                    print(
                        "Error: Syntax Error in the Ascii to Unicode Map "
                        "in line number ", line_number)
                    print("Line: " + text)
                    return 2  # Error - Syntax error in Mapping file
                lhs = parts[0].strip()
                rhs = parts[1].strip()
                if self.direction == 'a2u':
                    rules_dict[lhs] = rhs
                else:
                    rules_dict[rhs] = lhs
        return rules_dict

    def get_module_name(self):
        """Return the display name of this module."""
        return "Payyans Unicode-ASCII Converter"

    def get_info(self):
        """Return a one-line description of this module."""
        return "ASCII data - Unicode Convertor based on font maps"

Example #2

Show file

class Transliterator:
    """
    Transliteration class, instantiate this to get access to the
    transliteration methods.
    """
    def __init__(self):
        self.cmu = CMUDict()
        self.normalizer = Normalizer()

    def transliterate_en_ml(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Malayalam with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "ml_IN")

    def transliterate_en_kn(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Kannada with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "kn_IN")

    def transliterate_en_hi(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Hindi with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "hi_IN")

    def transliterate_en_xx(self, word, target_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.

        Transliterate English to any Indian Language. English targets return
        the word unchanged; Kannada and Hindi use their own CMU lookup; every
        other target goes through Malayalam and is then chained through
        ``transliterate_indic_indic``.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "hi_IN":
            return self.transliterate_en_hi(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str

        # Chain it through indic-indic transliteration. The Malayalam
        # specific zero-width joiners are removed first; they are written as
        # escapes because the raw characters are invisible in source.
        # transliterate_indic_indic strips both of them for a Malayalam
        # source anyway.
        tx_str = tx_str.replace(u"\u200d", u"").replace(u"\u200c", u"")
        # Remove the last virama. hi_IN never reaches this point, it has
        # already returned above.
        if tx_str[-1:] == u'്' and target_lang in ("gu_IN", "bn_IN"):
            tx_str = tx_str[:-1]
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :returns: the transliterated word.

        Transliterate Indian Language to English.
        """
        if src_lang in ("en_IN", "en_US"):
            return word

        # TODO: the function is generic now so no need of testing the
        # language, but since indic_en contains only kn_IN and ml_IN we need
        # this check. Add all indic languages to indic_en and replace this
        # block with a single call to the indic_en function.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, src_lang)
        if src_lang != "ml_IN":
            word = self.transliterate_indic_indic(word, src_lang, "ml_IN")

        return self.transliterate_indic_en(word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate the given word in src_language to ISO 15919 notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word_length = len(word)
        drops_final_a = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, character in enumerate(word, 1):
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all indian languages from its base
            if 61 <= offset <= 76:
                tx_str = tx_str[:-1]  # remove the last 'a'
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
            # delete the inherent 'a' at the end of the word from hindi
            if (tx_str[-1:] == 'a' and drops_final_a
                    and word_length == index and word_length > 1):
                tx_str = tx_str[:-1]
        # Text strings have no .decode() on Python 3; only decode when the
        # accumulated value really is a byte string.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to
        IPA - International Phonetic Alphabet notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word_length = len(word)
        drops_final_schwa = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, character in enumerate(word, 1):
            if ord(character) < 255:  # ASCII characters + English
                tx_str += character
                continue
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all indian languages from its base
            if 61 <= offset <= 76:
                tx_str = tx_str[:-(len('ə'))]  # remove the last 'ə'
            if 0 < offset <= 128:
                tx_str = tx_str + charmap_transphon["IPA"][offset]
            # delete the inherent 'ə' at the end of the word from hindi
            if (tx_str[-1:] == 'ə' and drops_final_schwa
                    and word_length == index and word_length > 1):
                tx_str = tx_str[:-(len('ə'))]
        # Text strings have no .decode() on Python 3; only decode when the
        # accumulated value really is a byte string.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def _malayalam_fixes(self, text):
        """Apply Malayalam specific clean-ups to transliterated text.

        Input that does not support these text replacements (for example
        ``None``) is returned as it was when the failure happened.
        """
        fixes = (
            (u"മ് ", u"ം "),
            (u"മ്,", u"ം,"),
            (u"മ്.", u"ം."),
            (u"മ്)", u"ം)"),
            (u"ഩ", u"ന"),
            (u"൤", u"."),  # danda by fullstop
        )
        try:
            for old, new in fixes:
                text = text.replace(old, new)
        except (AttributeError, TypeError):
            # Best effort only: keep whatever has been fixed so far.
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate from an Indian language word
        to another Indian language word

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.

        """
        schwa_deleting = ("hi_IN", "gu_IN", "pa_IN", "bn_IN")
        # Letters Tamil does not have, folded to the nearest Tamil letter.
        tamil_folds = (
            (u'\u0B96', u"க"), (u'\u0B97', u"க"), (u'\u0B98', u"க"),
            (u'\u0B9B', u"ச"), (u'\u0B9D', u"ச"),
            (u'\u0BA0', u"ட"), (u'\u0BA1', u"ட"), (u'\u0BA2', u"ட"),
            (u'\u0BA5', u"த"), (u'\u0BA6', u"த"), (u'\u0BA7', u"த"),
            (u'\u0BAB', u"ப"), (u'\u0BAC', u"ப"), (u'\u0BAD', u"ப"),
            (u'\u0BC3', u"ிரு"),
            (u'ஂ', u'ம்'),
        )
        tx_str = ""
        word = self.normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # drop every samvruthokaram (u sign followed by virama)
            word = word.replace(u"ു്", u"")

        word_length = len(word)
        for index, character in enumerate(word, 1):
            code_point = ord(character)
            # Copy through anything outside the Devanagari..Malayalam range
            # (U+0900..U+0D7F). The earlier test "<= 2304 and >= 3071" could
            # never be true, so digits and Latin letters were offset-shifted
            # or dropped.
            # NOTE(review): the upper bound assumes Malayalam is the highest
            # supported script - confirm against lang_bases.
            if (character in string.punctuation
                    or code_point < 0x0900 or code_point > 0x0D7F):
                tx_str = tx_str + character
                continue
            offset = code_point + self.getOffset(src_lang, target_lang)
            if offset > 0:
                tx_str = tx_str + chr(offset)
            # schwa deletion
            baseoffset = offset - lang_bases[target_lang]
            # 76 : virama
            # TODO Add more languages having schwa deletion characteristic
            if (index == word_length and baseoffset == 76
                    and target_lang in schwa_deleting):
                tx_str = tx_str[:-1]  # remove the trailing virama

        # The folds below are single-character substitutions, so applying
        # them once after the loop gives the same text as re-running them
        # after every character.
        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"ഩ", u"ന")
        if target_lang == "ta_IN":
            for unsupported, fallback in tamil_folds:
                tx_str = tx_str.replace(unsupported, fallback)

        # If target is malayalam, we need to add the virama. The slice keeps
        # an empty result from raising IndexError.
        if (target_lang == "ml_IN" and src_lang in schwa_deleting
                and tx_str[-1:].isalpha()):
            tx_str = tx_str + u"്"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate a word (or sentence) from an Indian language to English.

        :param word: Word to be transliterated (sentence)
        :param src_lang: Language from which we need to transliterate
        :returns: the transliterated text.
        """

        # Get all the language related stuffs
        dictionary = indic_en.get_dictionary_for(src_lang)
        vowels = indic_en.get_vowels_for(src_lang)
        vowel_signs = indic_en.get_vowel_signs_for(src_lang)
        virama = indic_en.get_virama_for(src_lang)
        anuswara = indic_en.get_anuswara_for(src_lang)

        word_length = len(word)
        index = 0
        tx_string = ""
        while index < word_length:

            # If the current character is a punctuation symbol copy it and
            # move on. Added to avoid getting an extra 'a' at the beginning
            # of the word next to the punctuation symbol.
            if word[index] in string.punctuation:
                tx_string += word[index]
                index += 1
                continue

            # Virama = conjunct marker, it has no sound of its own
            if word[index] == virama:
                index += 1
                continue

            # Get the english equivalent of the character.
            try:
                tx_string += dictionary[word[index]]
            except KeyError:
                # If the character isn't present in the dict
                # just append the character to the string
                tx_string += word[index]

            # inherent 'a' between two mapped letters
            if index + 1 < word_length \
                    and word[index + 1] not in vowel_signs \
                    and word[index + 1] in dictionary \
                    and word[index] not in vowels \
                    and word[index] not in vowel_signs:
                tx_string += 'a'

            # inherent 'a' after the last letter
            if index + 1 == word_length \
                    and word[index] not in vowel_signs \
                    and word[index] in dictionary:
                tx_string += 'a'

            # handle am sign
            if index + 1 < word_length and word[index + 1] == anuswara \
                    and word[index] not in vowel_signs:
                tx_string += 'a'
            index += 1
        return tx_string

    def transliterate(self, text, target_lang_code):
        """
        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be
            transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.

        The transliteration function which can transliterate text to the
        supported target languages.

        """
        tx_str = ""
        lines = text.split("\n")
        for line in lines:
            words = line.split(" ")
            for word in words:
                if word.strip():
                    try:
                        src_lang_code = detect_lang(word)[word]
                    except Exception:
                        # Language could not be detected: keep the word
                        # untouched. Exception (not a bare except) lets
                        # KeyboardInterrupt / SystemExit propagate.
                        tx_str = tx_str + " " + word
                        continue  # FIXME

                    if target_lang_code == "ISO15919":
                        tx_str = (tx_str + self.transliterate_iso15919(
                            word, src_lang_code) + " ")
                        continue

                    if target_lang_code == "IPA":
                        tx_str = (tx_str +
                                  self.transliterate_ipa(word, src_lang_code) +
                                  " ")
                        continue

                    if src_lang_code == "en_US":
                        tx_str = (tx_str + self.transliterate_en_xx(
                            word, target_lang_code) + " ")
                        continue

                    if target_lang_code in ("en_US", "en_IN"):
                        tx_str = (tx_str + self.transliterate_xx_en(
                            word, src_lang_code) + " ")
                        continue

                    tx_str += self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code)

                    if len(line) > 1:
                        tx_str += " "

                else:
                    tx_str = tx_str + word
            if len(lines) > 1:
                tx_str += "\n"
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """
        Return the code point distance from the ``src`` language block to the
        ``target`` language block, or 0 when either language is unknown.
        """
        try:
            return lang_bases[target] - lang_bases[src]
        except KeyError:
            return 0

    def get_module_name(self):
        """
        returns module name
        """
        return "Transliterator"

    def get_info(self):
        """
        Returns module info
        """
        return "Transliterate the text between any Indian Language"

Example #3

Show file

File: core.py Project: libindic/Transliteration

class Transliterator:
    """
    Transliteration class, instantiate this to get access  to the transliteration methods
    """
    def __init__(self):
        """Create the helpers used by the transliteration methods."""
        # Unicode normalizer applied to words before indic-indic conversion.
        normalizer = Normalizer()
        # CMU pronunciation dictionary used for English source words.
        pronunciation_dict = CMUDict()
        self.cmu = pronunciation_dict
        self.normalizer = normalizer

    def transliterate_en_ml(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the translated word.

        Transliterate English to Malayalam with the help of
        CMU pronuciation dictionary
        """
        return self.cmu.pronunciation(word, "ml_IN")

    def transliterate_en_kn(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the translated word.

        Transliterate English to Kannada with the help of
        CMU pronuciation dictionary
        """
        return self.cmu.pronunciation(word, "kn_IN")

    def transliterate_en_hi(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the translated word.

        Transliterate English to Hindi with the help of
        CMU pronuciation dictionary
        """
        return self.cmu.pronunciation(word, "hi_IN")

    def transliterate_en_xx(self, word, target_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.

        Transliterate English to any Indian Language. English targets return
        the word unchanged; Kannada and Hindi use their own CMU lookup; every
        other target goes through Malayalam and is then chained through
        ``transliterate_indic_indic``.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "hi_IN":
            return self.transliterate_en_hi(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str

        # Chain it through indic-indic transliteration. The Malayalam
        # specific zero-width joiners are removed first; they are written as
        # escapes because the raw characters are invisible in source.
        # transliterate_indic_indic strips both of them for a Malayalam
        # source anyway.
        tx_str = tx_str.replace(u"\u200d", u"").replace(u"\u200c", u"")
        # Remove the last virama. hi_IN never reaches this point, it has
        # already returned above.
        if tx_str[-1:] == u'്' and target_lang in ("gu_IN", "bn_IN"):
            tx_str = tx_str[:-1]
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        Transliterate a word from an Indian language to English.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :returns: the transliterated word; English input is returned as is.
        """
        if src_lang in ("en_IN", "en_US"):
            return word

        # TODO: the function is generic now so no need of testing the
        # language, but since indic_en contains only kn_IN and ml_IN we need
        # this check. Add all indic languages to indic_en and replace this
        # block with a single call to the indic_en function.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, src_lang)

        # Everything except Malayalam is first converted to Malayalam.
        if src_lang != "ml_IN":
            word = self.transliterate_indic_indic(word, src_lang, "ml_IN")
        return self.transliterate_indic_en(word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate the given word in src_language to ISO 15919 notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word_length = len(word)
        drops_final_a = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, character in enumerate(word, 1):
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all indian languages from its base
            if 61 <= offset <= 76:
                tx_str = tx_str[:-1]  # remove the last 'a'
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
            # delete the inherent 'a' at the end of the word from hindi
            if (tx_str[-1:] == 'a' and drops_final_a
                    and word_length == index and word_length > 1):
                tx_str = tx_str[:-1]
        # Text strings have no .decode() on Python 3; only decode when the
        # accumulated value really is a byte string.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to
        IPA - International Phonetic Alphabet notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word_length = len(word)
        drops_final_schwa = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, character in enumerate(word, 1):
            if ord(character) < 255:  # ASCII characters + English
                tx_str += character
                continue
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all indian languages from its base
            if 61 <= offset <= 76:
                tx_str = tx_str[:-(len('ə'))]  # remove the last 'ə'
            if 0 < offset <= 128:
                tx_str = tx_str + charmap_transphon["IPA"][offset]
            # delete the inherent 'ə' at the end of the word from hindi
            if (tx_str[-1:] == 'ə' and drops_final_schwa
                    and word_length == index and word_length > 1):
                tx_str = tx_str[:-(len('ə'))]
        # Text strings have no .decode() on Python 3; only decode when the
        # accumulated value really is a byte string.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def _malayalam_fixes(self, text):
        try:
            text = text.replace(u"മ് ", u"ം ")
            text = text.replace(u"മ്,", u"ം,")
            text = text.replace(u"മ്.", u"ം.")
            text = text.replace(u"മ്)", u"ം)")
            text = text.replace(u"ഩ", u"ന")
            text = text.replace(u"൤", u".")  # danda by fullstop
        except:
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate a word from one Indian language
        to another Indian language.

        The word is normalized, then every character inside the Indic
        code point range is shifted by ``getOffset(src_lang, target_lang)``.
        Punctuation and characters outside that range are copied through
        unchanged. A few language specific fix-ups are applied afterwards.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :param target_lang: The language into which word has to be transliterated.
        :type target_lang: str.
        :returns: the transliterated word.

        """
        # U+0900 starts the Devanagari block and U+0D7F ends the Malayalam
        # block. The previous test (``<= 2304 and >= 3071``) could never be
        # true, so digits, Latin letters and joiners were shifted as well.
        indic_first = 0x0900
        indic_last = 0x0D7F
        schwa_deleting = ("hi_IN", "gu_IN", "pa_IN", "bn_IN")
        # Letters that get folded onto another Tamil letter/sequence when
        # the target is Tamil; applied in this order.
        tamil_folds = (
            (u'\u0B96', u"க"),
            (u'\u0B97', u"க"),
            (u'\u0B98', u"க"),
            (u'\u0B9B', u"ச"),
            (u'\u0B9D', u"ச"),
            (u'\u0BA0', u"ட"),
            (u'\u0BA1', u"ட"),
            (u'\u0BA2', u"ட"),
            (u'\u0BA5', u"த"),
            (u'\u0BA6', u"த"),
            (u'\u0BA7', u"த"),
            (u'\u0BAB', u"ப"),
            (u'\u0BAC', u"ப"),
            (u'\u0BAD', u"ப"),
            (u'\u0BC3', u"ிரு"),
            (u'ஂ', u'ம்'),
        )

        word = self.normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            # Strip zero width non-joiner / joiner.
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # Remove samvruthokaram (u vowel sign followed by virama).
            # NOTE(review): the earlier comment said "replace ... by u
            # vowels" but the sequence is dropped entirely - confirm intent.
            word = word.replace(u"ു്", u"")

        # The shift is the same for every character, so compute it once.
        shift = self.getOffset(src_lang, target_lang)
        last_index = len(word) - 1
        pieces = []
        for index, character in enumerate(word):
            code = ord(character)
            if (character in string.punctuation
                    or code < indic_first or code > indic_last):
                pieces.append(character)
                continue
            offset = code + shift
            if offset > 0:
                pieces.append(chr(offset))
            # schwa deletion
            baseoffset = offset - lang_bases[target_lang]
            # 76 : virama
            if (index == last_index and baseoffset == 76
                    and target_lang in schwa_deleting):
                # TODO Add more languages having schwa deletion characteristic
                if pieces:
                    pieces.pop()  # drop the trailing character

        tx_str = u"".join(pieces)

        # The folds below only rewrite single characters into characters
        # that are never rewritten again, so running them once on the
        # finished string gives the same result as running them per character.
        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"ഩ", u"ന")

        if target_lang == "ta_IN":
            for source, replacement in tamil_folds:
                tx_str = tx_str.replace(source, replacement)

        # If target is malayalam, we need to add the virama.
        # The emptiness check avoids an IndexError on an empty result.
        if (target_lang == "ml_IN" and src_lang in schwa_deleting
                and tx_str and tx_str[-1].isalpha()):
            tx_str = tx_str + u"്"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate Indic text into Latin letters.

        Arguments:
        - `self`:
        - `word`: Word to be transliterated (sentence)
        - `src_lang`: Language from which we need to transliterate

        Returns the transliterated string. The character table, vowels,
        vowel signs, virama and anuswara all come from ``indic_en`` for
        `src_lang`.
        """
        # Language specific lookup data.
        dictionary = indic_en.get_dictionary_for(src_lang)
        vowels = indic_en.get_vowels_for(src_lang)
        vowel_signs = indic_en.get_vowel_signs_for(src_lang)
        virama = indic_en.get_virama_for(src_lang)
        anuswara = indic_en.get_anuswara_for(src_lang)

        total = len(word)
        pieces = []
        for position, current in enumerate(word):
            # Punctuation is copied as is; handling it up front keeps an
            # extra 'a' from showing up at the start of the following word.
            if current in string.punctuation:
                pieces.append(current)
                continue

            # The virama (conjunct marker) produces no output.
            if current == virama:
                continue

            # Latin equivalent of the character; characters missing from
            # the table are emitted unchanged.
            try:
                pieces.append(dictionary[current])
            except KeyError:
                pieces.append(current)

            has_next = position + 1 < total
            current_is_sign = current in vowel_signs

            if has_next:
                following = word[position + 1]
                # Inherent 'a' between this character and the next one.
                if (following not in vowel_signs
                        and following in dictionary
                        and current not in vowels
                        and not current_is_sign):
                    pieces.append('a')
            elif not current_is_sign and current in dictionary:
                # Inherent 'a' after the final character.
                pieces.append('a')

            # handle am sign
            if has_next and word[position + 1] == anuswara \
                    and not current_is_sign:
                pieces.append('a')
        return "".join(pieces)

    def transliterate(self, text, target_lang_code):
        """
        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.

        The transliteration function which can transliterate text to the
        supported target languages.

        The text is split on newlines and each line on single spaces.
        Output spacing follows these rules:

        * tokens that are empty or whitespace-only are appended verbatim
          (so runs of spaces collapse, because the split yields empty
          strings for them);
        * a word whose language cannot be detected is appended unchanged,
          preceded by a space;
        * ISO15919, IPA, from-English and to-English results are each
          followed by a space;
        * Indic to Indic results are followed by a space only when the
          line is longer than one character;
        * when the text has more than one line, every line (including the
          last) is followed by a newline.

        """
        pieces = []
        lines = text.split("\n")
        multi_line = len(lines) > 1
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    pieces.append(word)
                    continue

                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # Detection is best effort: keep the word untouched.
                    # Only Exception is caught so KeyboardInterrupt and
                    # SystemExit still propagate.
                    pieces.append(" " + word)
                    continue  # FIXME

                if target_lang_code == "ISO15919":
                    pieces.append(
                        self.transliterate_iso15919(word, src_lang_code)
                        + " ")
                    continue

                if target_lang_code == "IPA":
                    pieces.append(
                        self.transliterate_ipa(word, src_lang_code) + " ")
                    continue

                if src_lang_code == "en_US":
                    pieces.append(
                        self.transliterate_en_xx(word, target_lang_code)
                        + " ")
                    continue

                if target_lang_code in ("en_US", "en_IN"):
                    pieces.append(
                        self.transliterate_xx_en(word, src_lang_code) + " ")
                    continue

                pieces.append(self.transliterate_indic_indic(
                    word, src_lang_code, target_lang_code))

                if len(line) > 1:
                    pieces.append(" ")

            if multi_line:
                pieces.append("\n")

        tx_str = "".join(pieces)
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """
        Return the value to add to a character of `src` to get the
        matching character of `target`.

        The result is ``lang_bases[target] - lang_bases[src]``.

        :param src: source language code.
        :param target: target language code.
        :returns: the difference of the two bases, or 0 when either
            language is missing from ``lang_bases`` (or is not hashable).
        """
        try:
            return lang_bases[target] - lang_bases[src]
        except (KeyError, TypeError):
            # Unknown language: fall back to "no shift" instead of
            # swallowing every possible exception.
            return 0

    def get_module_name(self):
        """
        Return the display name of this module.
        """
        module_name = "Transliterator"
        return module_name

    def get_info(self):
        """
        Return a one line description of this module.
        """
        description = "Transliterate the text between any Indian Language"
        return description