Example #1
0
 def __init__(self):
     """Start with empty file names, no rules loaded, and a fresh Normalizer."""
     # All three file names begin as empty strings.
     self.input_filename = self.output_filename = self.mapping_filename = ""
     # No mapping rules have been loaded yet.
     self.rulesDict = None
     self.pdf = 0
     self.normalizer = Normalizer()
Example #2
0
 def __init__(self):
     """Instantiate the CMUDict and Normalizer collaborators of this object."""
     # Stored as ``self.cmu``.
     self.cmu = CMUDict()
     # Stored as ``self.normalizer``.
     self.normalizer = Normalizer()
Example #3
0
class Transliterator:
    """
    Transliteration class, instantiate this to get access  to the transliteration methods
    """
    def __init__(self):
        """Instantiate the CMUDict and Normalizer collaborators."""
        # ``self.cmu`` is queried by the ``transliterate_en_*`` methods.
        self.cmu = CMUDict()
        # ``self.normalizer`` is applied to every word in
        # ``transliterate_indic_indic``.
        self.normalizer = Normalizer()

    def transliterate_en_ml(self, word):
        """
        Transliterate an English word to Malayalam.

        The lookup is delegated to ``self.cmu.pronunciation`` with the
        ``"ml_IN"`` language code.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the value returned by ``self.cmu.pronunciation``.
        """
        target = "ml_IN"
        return self.cmu.pronunciation(word, target)

    def transliterate_en_kn(self, word):
        """
        Transliterate an English word to Kannada.

        The lookup is delegated to ``self.cmu.pronunciation`` with the
        ``"kn_IN"`` language code.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the value returned by ``self.cmu.pronunciation``.
        """
        target = "kn_IN"
        return self.cmu.pronunciation(word, target)

    def transliterate_en_hi(self, word):
        """
        Transliterate an English word to Hindi.

        The lookup is delegated to ``self.cmu.pronunciation`` with the
        ``"hi_IN"`` language code.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the value returned by ``self.cmu.pronunciation``.
        """
        target = "hi_IN"
        return self.cmu.pronunciation(word, target)

    def transliterate_en_xx(self, word, target_lang):
        """
        Transliterate an English word to an Indian language.

        English targets return ``word`` untouched.  Kannada, Hindi and
        Malayalam are produced directly by the matching
        ``transliterate_en_*`` method.  Every other target is reached by
        producing Malayalam first and handing it to
        ``transliterate_indic_indic``.

        :param word: The word to be transliterated.
        :type word: str.
        :param target_lang: The language into which word has to be transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "hi_IN":
            return self.transliterate_en_hi(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str

        # Chain the Malayalam result through Indic-to-Indic transliteration.
        # First drop the Malayalam-specific zero width joiner (U+200D); it is
        # spelled as an escape so the invisible character is reviewable.
        tx_str = tx_str.replace(u"\u200d", u"")
        # Remove a trailing Malayalam virama (U+0D4D) for these targets.
        # Hindi already returned above, so it does not need to be listed here.
        if target_lang in ("gu_IN", "bn_IN") and tx_str[-1:] == u"\u0d4d":
            tx_str = tx_str[:-1]
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        Transliterate a word written in an Indian language to English.

        English input is returned unchanged.  Kannada goes straight to
        ``transliterate_indic_en``; any other non-Malayalam source is first
        converted to Malayalam and then transliterated from Malayalam.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :returns: the transliterated word.
        """
        if src_lang in ("en_IN", "en_US"):
            return word

        # TODO: transliterate_indic_en is generic, but indic_en only carries
        # data for kn_IN and ml_IN, hence the language test below.  Once all
        # Indic languages are added to indic_en this whole block can become a
        # single call.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, "kn_IN")

        malayalam_word = word
        if src_lang != "ml_IN":
            malayalam_word = self.transliterate_indic_indic(
                word, src_lang, "ml_IN")
        return self.transliterate_indic_en(malayalam_word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate ``word`` from ``src_language`` to ISO 15919 notation.

        Each character is reduced to its offset from the base of the source
        language (``lang_bases``) and looked up in ``charmap["ISO15919"]``.
        Characters whose offset falls outside 1..128 contribute nothing.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word as text.
        :raises KeyError: if ``src_language`` is not in ``lang_bases`` and
            ``word`` is not empty.
        """
        # Languages for which the inherent 'a' at the end of a word is dropped.
        drops_final_a = src_language in ("hi_IN", "gu_IN", "bn_IN")
        word_length = len(word)
        tx_str = ""
        for index, character in enumerate(word, 1):
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all Indian languages from their
            # base; for offsets 61..76 the previous 'a' is removed.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-1]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
            # Delete the inherent 'a' at the end of a multi-letter word.
            if (drops_final_a and index == word_length and word_length > 1
                    and tx_str[-1:] == 'a'):
                tx_str = tx_str[:-1]
        # Only byte strings need decoding; text (the normal case on Python 3)
        # has no ``decode`` method and is returned as is.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to
        IPA - International Phonetic Alphabet notation.

        Characters with a code point below 255 are copied through unchanged.
        Every other character is reduced to its offset from the base of the
        source language (``lang_bases``) and looked up in
        ``charmap_transphon["IPA"]``.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word as text.
        :raises KeyError: if a non-Latin character is met and
            ``src_language`` is not in ``lang_bases``.
        """
        schwa = 'ə'
        # Languages for which the inherent schwa at the end of a word is dropped.
        drops_final_schwa = src_language in ("hi_IN", "gu_IN", "bn_IN")
        word_length = len(word)
        tx_str = ""
        for index, character in enumerate(word, 1):
            if ord(character) < 255:  # ASCII characters + English
                tx_str += character
                continue
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all Indian languages from their
            # base; for offsets 61..76 the previous schwa is removed.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-len(schwa)]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap_transphon["IPA"][offset]
            # Delete the inherent schwa at the end of a multi-letter word.
            # ``endswith`` keeps the test correct however long ``schwa`` is.
            if (drops_final_schwa and index == word_length and word_length > 1
                    and tx_str.endswith(schwa)):
                tx_str = tx_str[:-len(schwa)]
        # Only byte strings need decoding; text (the normal case on Python 3)
        # has no ``decode`` method and is returned as is.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def _malayalam_fixes(self, text):
        try:
            text = text.replace(u"മ് ", u"ം ")
            text = text.replace(u"മ്,", u"ം,")
            text = text.replace(u"മ്.", u"ം.")
            text = text.replace(u"മ്)", u"ം)")
            text = text.replace(u"ഩ", u"ന")
            text = text.replace(u"൤", u".")  # danda by fullstop
        except:
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate from an Indian language word
        to another Indian language word.

        The word is normalized, then every character inside the Indic
        Unicode range is shifted by the distance between the two language
        bases (``getOffset``).  Punctuation and characters outside that range
        are copied through unchanged.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :param target_lang: The language into which word has to be transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        :raises KeyError: if an Indic character is met and ``target_lang`` is
            not in ``lang_bases``.
        """
        tx_str = ""
        word = self.normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # Drop every samvruthokaram (u sign followed by virama).
            word = word.replace(u"\u0d41\u0d4d", u"")

        shift = self.getOffset(src_lang, target_lang)
        word_length = len(word)
        for index, character in enumerate(word, 1):
            code_point = ord(character)
            # Anything that is punctuation or lies outside U+0900..U+0D7F
            # (Devanagari through Malayalam) is not shifted.  The previous
            # test (``<= 2304 and >= 3071``) could never be true, so such
            # characters were shifted into unrelated blocks or dropped.
            if (character in string.punctuation
                    or not 0x0900 <= code_point <= 0x0D7F):
                tx_str = tx_str + character
                continue
            offset = code_point + shift
            if offset > 0:
                tx_str = tx_str + chr(offset)
            # Schwa deletion: 76 is the virama offset from the language base.
            baseoffset = offset - lang_bases[target_lang]
            if (index == word_length and baseoffset == 76
                    and target_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN")):
                # TODO Add more languages having schwa deletion characteristic
                tx_str = tx_str[:-1]  # remove the trailing virama

        # Every pattern below is a single character that none of the
        # replacements reintroduce, so one pass after the loop gives the same
        # result as repeating the replacements on every iteration.
        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"ഩ", u"ന")

        if target_lang == "ta_IN":
            # Fold the shifted letters listed here onto the letters Tamil
            # writes them with.
            tamil_replacements = (
                (u'\u0B96', u"க"),
                (u'\u0B97', u"க"),
                (u'\u0B98', u"க"),
                (u'\u0B9B', u"ச"),
                (u'\u0B9D', u"ச"),
                (u'\u0BA0', u"ட"),
                (u'\u0BA1', u"ட"),
                (u'\u0BA2', u"ட"),
                (u'\u0BA5', u"த"),
                (u'\u0BA6', u"த"),
                (u'\u0BA7', u"த"),
                (u'\u0BAB', u"ப"),
                (u'\u0BAC', u"ப"),
                (u'\u0BAD', u"ப"),
                (u'\u0BC3', u"ிரு"),
                (u'ஂ', u'ம்'),
            )
            for old, new in tamil_replacements:
                tx_str = tx_str.replace(old, new)

        # If target is Malayalam, we need to add the virama.  Slicing keeps
        # this safe when nothing was produced (empty input).
        if (target_lang == "ml_IN"
                and src_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN")
                and tx_str[-1:].isalpha()):
            tx_str = tx_str + u"\u0d4d"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate Indic text to English letters.

        Arguments:
        - `self`:
        - `word`: Word to be transliterated (sentence)
        - `src_lang`: Language from which we need to transliterate

        Returns the transliterated string.
        """
        # Language specific lookup data.
        dictionary = indic_en.get_dictionary_for(src_lang)
        vowels = indic_en.get_vowels_for(src_lang)
        vowel_signs = indic_en.get_vowel_signs_for(src_lang)
        virama = indic_en.get_virama_for(src_lang)
        anuswara = indic_en.get_anuswara_for(src_lang)

        pieces = []
        last = len(word) - 1
        for position, current in enumerate(word):
            # Punctuation is copied as is; handling it first avoids an extra
            # 'a' at the start of the word that follows it.
            if current in string.punctuation:
                pieces.append(current)
                continue

            # The virama only joins consonants and produces no output.
            if current == virama:
                continue

            # English equivalent of the character; characters missing from
            # the dictionary are kept unchanged.
            try:
                pieces.append(dictionary[current])
            except KeyError:
                pieces.append(current)

            if position < last:
                following = word[position + 1]
                # Inherent 'a' between this character and a following mapped
                # character that is not a vowel sign.
                if not following in vowel_signs \
                        and following in dictionary \
                        and not current in vowels \
                        and not current in vowel_signs:
                    pieces.append('a')
            elif not current in vowel_signs and current in dictionary:
                # Inherent 'a' after the final character.
                pieces.append('a')

            # Handle the "am" sign (anuswara) that follows this character.
            if position < last and word[position + 1] == anuswara \
                    and not current in vowel_signs:
                pieces.append('a')
        return "".join(pieces)

    def transliterate(self, text, target_lang_code):
        """
        Transliterate text to one of the supported targets.

        The text is split into lines and space separated words.  The source
        language of each word is detected with ``detect_lang`` and the word
        is routed to the matching ``transliterate_*`` method.  Words whose
        language cannot be detected are copied through preceded by a space.

        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be
            transliterated; ``"ISO15919"`` and ``"IPA"`` are also accepted.
        :type target_lang_code: str.
        :returns: the transliterated text.
        """
        tx_str = ""
        lines = text.split("\n")
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    # Empty or whitespace-only token: copied unchanged.
                    tx_str = tx_str + word
                    continue

                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # FIXME: detection failures are tolerated on purpose, but
                    # only ordinary exceptions are swallowed now, so
                    # KeyboardInterrupt and SystemExit still propagate.
                    tx_str = tx_str + " " + word
                    continue

                if target_lang_code == "ISO15919":
                    tx_str = (tx_str + self.transliterate_iso15919(
                        word, src_lang_code) + " ")
                elif target_lang_code == "IPA":
                    tx_str = (tx_str +
                              self.transliterate_ipa(word, src_lang_code) +
                              " ")
                elif src_lang_code == "en_US":
                    tx_str = (tx_str + self.transliterate_en_xx(
                        word, target_lang_code) + " ")
                elif target_lang_code in ("en_US", "en_IN"):
                    tx_str = (tx_str + self.transliterate_xx_en(
                        word, src_lang_code) + " ")
                else:
                    tx_str += self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code)
                    if len(line) > 1:
                        tx_str += " "
            # A newline follows every line whenever the text had several.
            if len(lines) > 1:
                tx_str += "\n"
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """
        Return the distance between the bases of two languages.

        :param src: Source language code, a key of ``lang_bases``.
        :param target: Target language code, a key of ``lang_bases``.
        :returns: ``lang_bases[target] - lang_bases[src]``, or 0 when either
            code is unknown.
        """
        try:
            src_id = lang_bases[src]
            target_id = lang_bases[target]
        except (KeyError, TypeError):
            # Unknown (or unhashable) language code: apply no shift.
            return 0
        return target_id - src_id

    def get_module_name(self):
        """
        Return the name of this module.
        """
        module_name = "Transliterator"
        return module_name

    def get_info(self):
        """
        Return a one-line description of this module.
        """
        description = "Transliterate the text between any Indian Language"
        return description
Example #4
0
class Payyans():
    def __init__(self):
        """Start with empty file names, no rules loaded, and a Normalizer."""
        # All three file names begin as empty strings; mapping_filename is
        # set by Unicode2ASCII / ASCII2Unicode before the rules are loaded.
        self.input_filename = self.output_filename = self.mapping_filename = ""
        # Filled with the result of LoadRules() on every conversion.
        self.rulesDict = None
        self.pdf = 0
        self.normalizer = Normalizer()

    def Unicode2ASCII(self, unicode_text, font):
        """
        Convert Unicode text to the ASCII encoding of ``font``.

        The mapping is loaded from ``maps/<font>.map`` next to this module
        through ``LoadRules``.  The text is matched greedily, three code
        points at a time down to one; unmapped characters are copied
        through unchanged.

        :param unicode_text: Text to convert.
        :param font: Name of the font map (file name without ``.map``).
        :returns: the converted text.
        """
        unicode_text = self.normalizer.normalize(unicode_text)
        self.direction = "u2a"
        self.mapping_filename = os.path.join(os.path.dirname(__file__), 'maps',
                                             font + ".map")
        self.rulesDict = self.LoadRules()

        # Signs compared as text.  The previous code compared UTF-8 *bytes*
        # with str literals, which never matches on Python 3, so the signs
        # were never moved.
        # Two-part signs: one glyph goes before and one after the last glyph.
        two_part_signs = (u"\u0d4b", u"\u0d4a", u"\u0d4c")  # o-signs, au-sign
        # Signs whose glyph goes before the last glyph already written.
        left_signs = (u"\u0d48", u"\u0d46", u"\u0d47",      # ai, e, ee signs
                      u"\u0d4d\u0d30")                      # virama + ra

        ascii_text = ""
        index = 0
        text_length = len(unicode_text)
        while index < text_length:
            # Longest match first: a shortcut for conjuncts.
            for charNo in (3, 2, 1):
                letter = unicode_text[index:index + charNo]
                if letter in self.rulesDict:
                    ascii_letter = self.rulesDict[letter]
                    # Fit the vowel sign glyphs around the last glyph.
                    if letter in two_part_signs:
                        ascii_text = ascii_text[:-1] + ascii_letter[0] + \
                            ascii_text[-1:] + ascii_letter[1]
                    elif letter in left_signs:
                        ascii_text = ascii_text[:-1] + ascii_letter + \
                            ascii_text[-1:]
                    else:
                        ascii_text = ascii_text + ascii_letter
                    index = index + charNo
                    break
                if charNo == 1:
                    # No mapping at any length: keep the character.
                    ascii_text = ascii_text + letter
                    index = index + 1
                    break
        return ascii_text

    def ASCII2Unicode(self, ascii_text, font):
        """
        Convert text in the ASCII encoding of ``font`` to Unicode.

        The mapping is loaded from ``maps/<font>.map`` next to this module
        through ``LoadRules``.  Glyphs are matched two characters at a time,
        then one; unmapped characters are copied through unchanged.  A
        pre-base sign (see ``isPrebase``) is held back and emitted after the
        next letter and its optional post-base sign (see ``isPostbase``).

        :param ascii_text: Text to convert.
        :param font: Name of the font map (file name without ``.map``).
        :returns: the converted text.
        """
        ascii_text = self.normalizer.normalize(ascii_text)
        self.direction = "a2u"
        self.mapping_filename = os.path.join(os.path.dirname(__file__), 'maps',
                                             font + ".map")
        self.rulesDict = self.LoadRules()

        # Independent vowels that merge with a pending pre-base sign.
        # Compared as text: the previous comparison of UTF-8 bytes with str
        # literals never matched on Python 3.
        merging_vowels = (u"\u0d0e", u"\u0d12")  # letters E and O

        index = 0
        prebase_letter = ""
        postbase_letter = ""  # virama + ya, virama + va
        unicode_text = ""
        while index < len(ascii_text):
            for charNo in (2, 1):
                letter = ascii_text[index:index + charNo]
                if letter in self.rulesDict:
                    unicode_letter = self.rulesDict[letter]
                    if self.isPrebase(unicode_letter):
                        # A vowel sign: keep it until its letter is known.
                        prebase_letter = unicode_letter
                    else:
                        # Not a vowel sign: check whether a post-base sign
                        # follows the letter.
                        post_index = index + charNo
                        if post_index < len(ascii_text):
                            next_letter = ascii_text[post_index]
                            if next_letter in self.rulesDict:
                                next_ucode_letter = self.rulesDict[next_letter]
                                if self.isPostbase(next_ucode_letter):
                                    postbase_letter = next_ucode_letter
                                    index = index + 1
                        if unicode_letter in merging_vowels:
                            # getVowelSign expects (vowel, sign); the two
                            # arguments used to be passed the other way round.
                            unicode_text = unicode_text + postbase_letter + \
                                self.getVowelSign(unicode_letter,
                                                  prebase_letter)
                        else:
                            unicode_text = unicode_text + unicode_letter + \
                                postbase_letter + prebase_letter
                        prebase_letter = ""
                        postbase_letter = ""
                    index = index + charNo
                    break
                if charNo == 1:
                    # No mapping at any length: keep the character.
                    unicode_text = unicode_text + letter
                    index = index + 1
                    break
        return unicode_text

    def getVowelSign(self, vowel_letter, vowel_sign_letter):
        """
        Combine an independent vowel with a vowel sign.

        Three pairs merge into a single vowel letter; any other pair is
        returned as the plain concatenation ``vowel_letter +
        vowel_sign_letter``.

        :param vowel_letter: The independent vowel.
        :param vowel_sign_letter: The vowel sign attached to it.
        :returns: the merged vowel or the concatenation of both arguments.
        """
        # Compared as text: the previous comparison of UTF-8 bytes with str
        # literals never matched on Python 3.
        merged = {
            (u"\u0d0e", u"\u0d46"): u"\u0d10",  # E + e-sign   -> AI
            (u"\u0d12", u"\u0d3e"): u"\u0d13",  # O + aa-sign  -> OO
            (u"\u0d12", u"\u0d57"): u"\u0d14",  # O + au-mark  -> AU
        }
        pair = (vowel_letter, vowel_sign_letter)
        if pair in merged:
            return merged[pair]
        return vowel_letter + vowel_sign_letter

    def isPrebase(self, letter):
        '''
        Tell whether ``letter`` is a pre-base sign.

        Some vowel signs are drawn to the left of their letter.  ASCII fonts
        also store them to the left, while Unicode stores them after the
        letter; this test identifies those signs.

        :param letter: A Unicode string taken from the font map.
        :returns: True for a pre-base sign, False otherwise.
        '''
        # Compared as text: the previous comparison of UTF-8 bytes with str
        # literals never matched on Python 3.
        prebase_signs = (
            u"\u0d47",        # vowel sign ee
            u"\u0d48",        # vowel sign ai
            u"\u0d4a",        # vowel sign o
            u"\u0d4b",        # vowel sign oo
            u"\u0d4c",        # vowel sign au
            u"\u0d4d\u0d30",  # virama + ra
            u"\u0d46",        # vowel sign e
        )
        return letter in prebase_signs

    def isPostbase(self, letter):
        '''
        Tell whether ``letter`` is a post-base sign.

        The post-base signs are virama + ya and virama + va.  When one
        follows a consonant, a pending pre-base sign must be added only
        after consonant + post-base.

        :param letter: A Unicode string taken from the font map.
        :returns: True for a post-base sign, False otherwise.
        '''
        # Compared as text: the previous comparison of UTF-8 bytes with str
        # literals never matched on Python 3.
        postbase_signs = (
            u"\u0d4d\u0d2f",  # virama + ya
            u"\u0d4d\u0d35",  # virama + va
        )
        return letter in postbase_signs

    def LoadRules(self):
        '''
        Read the mapping file and build the conversion dictionary.

        The file named by ``self.mapping_filename`` is read as UTF-8.  Lines
        starting with ``#`` and blank lines are skipped; every other line
        must have the form ``lhs=rhs``.  With ``self.direction == 'a2u'`` the
        dictionary maps lhs to rhs, otherwise rhs to lhs.

        :returns: the rules dictionary, or the integer 2 when a line is not
            of the form ``lhs=rhs`` (the error is also printed).
        '''
        rules_dict = dict()
        # The context manager closes the file on every exit path, including
        # the early return on a syntax error; it used to stay open.
        with codecs.open(self.mapping_filename,
                         encoding='utf-8',
                         errors='ignore') as rules_file:
            # The line number is only needed for the error report.
            for line_number, text in enumerate(rules_file, 1):
                # Comment line.
                if text[0] == '#':
                    continue
                line = text.strip()
                # Blank line.
                if line == "":
                    continue
                parts = line.split("=")
                if len(parts) != 2:
                    print(
                        "Error: Syntax Error in the Ascii to Unicode Map "
                        "in line number ", line_number)
                    print("Line: " + text)
                    return 2  # Error - Syntax error in Mapping file
                lhs = parts[0].strip()
                rhs = parts[1].strip()
                if self.direction == 'a2u':
                    rules_dict[lhs] = rhs
                else:
                    rules_dict[rhs] = lhs
        return rules_dict

    def get_module_name(self):
        """Return the name of this module."""
        module_name = "Payyans Unicode-ASCII Converter"
        return module_name

    def get_info(self):
        """Return a one-line description of this module."""
        description = "ASCII data - Unicode Convertor based on font maps"
        return description
Example #5
0
from flask import Flask, jsonify, request, render_template, url_for
from pymongo import MongoClient
from libindic.normalizer import Normalizer

import sys, os, json
import pandas as pd

# Flask application object; the routes below are registered on it.
app = Flask(__name__)
# NOTE: despite its name, `db` is the MongoClient itself (created with no
# arguments), not a database handle.
db = MongoClient()
# Collection 'olam-enml' inside the database 'olam'.
olam = db['olam']['olam-enml']
# Directory containing this module; used to locate olam-enml.csv.
APP_ROOT = os.path.dirname(__file__)

# Module-wide Normalizer instance.
nm = Normalizer()


def import_csv_to_db():
    """Replace the contents of the ``olam`` collection with ``olam-enml.csv``.

    The tab separated file is read from the directory of this module with
    pandas, converted to a list of records through JSON, and inserted after
    the collection has been emptied.  The path of the file is printed.
    """
    file_res = os.path.join(APP_ROOT, 'olam-enml.csv')
    # Function-call form: valid on Python 3 and prints the same single
    # value on Python 2 (the statement form is a SyntaxError on Python 3).
    print(file_res)
    csv_data = pd.read_csv(file_res, sep='\t')
    json_data = json.loads(csv_data.to_json(orient='records'))
    # NOTE(review): Collection.remove()/insert() are deprecated since
    # PyMongo 3 and removed in PyMongo 4 (delete_many({}) / insert_many()
    # replace them) -- confirm the PyMongo version in use before switching.
    olam.remove()
    olam.insert(json_data)


@app.route('/')
def index():
    """Serve the page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page


@app.route('/search', methods=['GET'])
def search():
Example #6
0
 def __init__(self):
     """Instantiate the CMUDict and Normalizer collaborators of this object."""
     # Stored as ``self.cmu``.
     self.cmu = CMUDict()
     # Stored as ``self.normalizer``.
     self.normalizer = Normalizer()
Example #7
0
class Transliterator:
    """
    Transliteration class, instantiate this to get access  to the transliteration methods
    """
    def __init__(self):
        """Instantiate the CMUDict and Normalizer collaborators."""
        # ``self.cmu`` is queried by the ``transliterate_en_*`` methods.
        self.cmu = CMUDict()
        # ``self.normalizer`` is applied to every word in
        # ``transliterate_indic_indic``.
        self.normalizer = Normalizer()

    def transliterate_en_ml(self, word):
        """
        Transliterate an English word to Malayalam.

        The lookup is delegated to ``self.cmu.pronunciation`` with the
        ``"ml_IN"`` language code.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the value returned by ``self.cmu.pronunciation``.
        """
        target = "ml_IN"
        return self.cmu.pronunciation(word, target)

    def transliterate_en_kn(self, word):
        """
        Transliterate an English word to Kannada.

        The lookup is delegated to ``self.cmu.pronunciation`` with the
        ``"kn_IN"`` language code.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the value returned by ``self.cmu.pronunciation``.
        """
        target = "kn_IN"
        return self.cmu.pronunciation(word, target)

    def transliterate_en_hi(self, word):
        """
        Transliterate an English word to Hindi.

        The lookup is delegated to ``self.cmu.pronunciation`` with the
        ``"hi_IN"`` language code.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the value returned by ``self.cmu.pronunciation``.
        """
        target = "hi_IN"
        return self.cmu.pronunciation(word, target)

    def transliterate_en_xx(self, word, target_lang):
        """
        Transliterate an English word to an Indian language.

        English targets return ``word`` untouched.  Kannada, Hindi and
        Malayalam are produced directly by the matching
        ``transliterate_en_*`` method.  Every other target is reached by
        producing Malayalam first and handing it to
        ``transliterate_indic_indic``.

        :param word: The word to be transliterated.
        :type word: str.
        :param target_lang: The language into which word has to be transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "hi_IN":
            return self.transliterate_en_hi(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str

        # Chain the Malayalam result through Indic-to-Indic transliteration.
        # First drop the Malayalam-specific zero width joiner (U+200D); it is
        # spelled as an escape so the invisible character is reviewable.
        tx_str = tx_str.replace(u"\u200d", u"")
        # Remove a trailing Malayalam virama (U+0D4D) for these targets.
        # Hindi already returned above, so it does not need to be listed here.
        if target_lang in ("gu_IN", "bn_IN") and tx_str[-1:] == u"\u0d4d":
            tx_str = tx_str[:-1]
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        Transliterate a word written in an Indian language to English.

        English input is returned unchanged.  Kannada goes straight to
        ``transliterate_indic_en``; any other non-Malayalam source is first
        converted to Malayalam and then transliterated from Malayalam.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :returns: the transliterated word.
        """
        if src_lang in ("en_IN", "en_US"):
            return word

        # TODO: transliterate_indic_en is generic, but indic_en only carries
        # data for kn_IN and ml_IN, hence the language test below.  Once all
        # Indic languages are added to indic_en this whole block can become a
        # single call.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, "kn_IN")

        malayalam_word = word
        if src_lang != "ml_IN":
            malayalam_word = self.transliterate_indic_indic(
                word, src_lang, "ml_IN")
        return self.transliterate_indic_en(malayalam_word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate ``word`` from ``src_language`` to ISO 15919 notation.

        Each character is reduced to its offset from the base of the source
        language (``lang_bases``) and looked up in ``charmap["ISO15919"]``.
        Characters whose offset falls outside 1..128 contribute nothing.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word as text.
        :raises KeyError: if ``src_language`` is not in ``lang_bases`` and
            ``word`` is not empty.
        """
        # Languages for which the inherent 'a' at the end of a word is dropped.
        drops_final_a = src_language in ("hi_IN", "gu_IN", "bn_IN")
        word_length = len(word)
        tx_str = ""
        for index, character in enumerate(word, 1):
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all Indian languages from their
            # base; for offsets 61..76 the previous 'a' is removed.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-1]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
            # Delete the inherent 'a' at the end of a multi-letter word.
            if (drops_final_a and index == word_length and word_length > 1
                    and tx_str[-1:] == 'a'):
                tx_str = tx_str[:-1]
        # Only byte strings need decoding; text (the normal case on Python 3)
        # has no ``decode`` method and is returned as is.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to
        IPA - International Phonetic Alphabet notation.

        Characters with a code point below 255 are copied through unchanged.
        Every other character is reduced to its offset from the base of the
        source language (``lang_bases``) and looked up in
        ``charmap_transphon["IPA"]``.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word as text.
        :raises KeyError: if a non-Latin character is met and
            ``src_language`` is not in ``lang_bases``.
        """
        schwa = 'ə'
        # Languages for which the inherent schwa at the end of a word is dropped.
        drops_final_schwa = src_language in ("hi_IN", "gu_IN", "bn_IN")
        word_length = len(word)
        tx_str = ""
        for index, character in enumerate(word, 1):
            if ord(character) < 255:  # ASCII characters + English
                tx_str += character
                continue
            offset = ord(character) - lang_bases[src_language]
            # 76 is the virama offset for all Indian languages from their
            # base; for offsets 61..76 the previous schwa is removed.
            if 61 <= offset <= 76:
                tx_str = tx_str[:-len(schwa)]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap_transphon["IPA"][offset]
            # Delete the inherent schwa at the end of a multi-letter word.
            # ``endswith`` keeps the test correct however long ``schwa`` is.
            if (drops_final_schwa and index == word_length and word_length > 1
                    and tx_str.endswith(schwa)):
                tx_str = tx_str[:-len(schwa)]
        # Only byte strings need decoding; text (the normal case on Python 3)
        # has no ``decode`` method and is returned as is.
        if isinstance(tx_str, bytes):
            tx_str = tx_str.decode("utf-8")
        return tx_str

    def _malayalam_fixes(self, text):
        try:
            text = text.replace(u"മ് ", u"ം ")
            text = text.replace(u"മ്,", u"ം,")
            text = text.replace(u"മ്.", u"ം.")
            text = text.replace(u"മ്)", u"ം)")
            text = text.replace(u"ഩ", u"ന")
            text = text.replace(u"൤", u".")  # danda by fullstop
        except:
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate from an Indian language word
        to another Indian language word.

        The word is normalized, then every character inside the Indic
        Unicode range is shifted by the distance between the two language
        bases (``getOffset``).  Punctuation and characters outside that range
        are copied through unchanged.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :param target_lang: The language into which word has to be transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        :raises KeyError: if an Indic character is met and ``target_lang`` is
            not in ``lang_bases``.
        """
        tx_str = ""
        word = self.normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # Drop every samvruthokaram (u sign followed by virama).
            word = word.replace(u"\u0d41\u0d4d", u"")

        shift = self.getOffset(src_lang, target_lang)
        word_length = len(word)
        for index, character in enumerate(word, 1):
            code_point = ord(character)
            # Anything that is punctuation or lies outside U+0900..U+0D7F
            # (Devanagari through Malayalam) is not shifted.  The previous
            # test (``<= 2304 and >= 3071``) could never be true, so such
            # characters were shifted into unrelated blocks or dropped.
            if (character in string.punctuation
                    or not 0x0900 <= code_point <= 0x0D7F):
                tx_str = tx_str + character
                continue
            offset = code_point + shift
            if offset > 0:
                tx_str = tx_str + chr(offset)
            # Schwa deletion: 76 is the virama offset from the language base.
            baseoffset = offset - lang_bases[target_lang]
            if (index == word_length and baseoffset == 76
                    and target_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN")):
                # TODO Add more languages having schwa deletion characteristic
                tx_str = tx_str[:-1]  # remove the trailing virama

        # Every pattern below is a single character that none of the
        # replacements reintroduce, so one pass after the loop gives the same
        # result as repeating the replacements on every iteration.
        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"ഩ", u"ന")

        if target_lang == "ta_IN":
            # Fold the shifted letters listed here onto the letters Tamil
            # writes them with.
            tamil_replacements = (
                (u'\u0B96', u"க"),
                (u'\u0B97', u"க"),
                (u'\u0B98', u"க"),
                (u'\u0B9B', u"ச"),
                (u'\u0B9D', u"ச"),
                (u'\u0BA0', u"ட"),
                (u'\u0BA1', u"ட"),
                (u'\u0BA2', u"ட"),
                (u'\u0BA5', u"த"),
                (u'\u0BA6', u"த"),
                (u'\u0BA7', u"த"),
                (u'\u0BAB', u"ப"),
                (u'\u0BAC', u"ப"),
                (u'\u0BAD', u"ப"),
                (u'\u0BC3', u"ிரு"),
                (u'ஂ', u'ம்'),
            )
            for old, new in tamil_replacements:
                tx_str = tx_str.replace(old, new)

        # If target is Malayalam, we need to add the virama.  Slicing keeps
        # this safe when nothing was produced (empty input).
        if (target_lang == "ml_IN"
                and src_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN")
                and tx_str[-1:].isalpha()):
            tx_str = tx_str + u"\u0d4d"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate Indic text into English (Latin) letters.

        :param word: Word to be transliterated (may be a sentence).
        :type word: str.
        :param src_lang: Language from which we need to transliterate.
        :type src_lang: str.
        :returns: the transliterated text.
        """

        # Language specific lookup tables.
        dictionary = indic_en.get_dictionary_for(src_lang)
        vowels = indic_en.get_vowels_for(src_lang)
        vowel_signs = indic_en.get_vowel_signs_for(src_lang)
        virama = indic_en.get_virama_for(src_lang)
        anuswara = indic_en.get_anuswara_for(src_lang)

        last_position = len(word) - 1
        pieces = []
        for position, symbol in enumerate(word):
            # Punctuation is copied as it is. Handling it up front avoids
            # an extra 'a' at the beginning of the word that follows it.
            if symbol in string.punctuation:
                pieces.append(symbol)
                continue

            # The virama only joins consonants and has no output.
            if symbol == virama:
                continue

            # English equivalent of the character; characters missing from
            # the dictionary are emitted unchanged.
            try:
                pieces.append(dictionary[symbol])
            except KeyError:
                pieces.append(symbol)

            is_vowel_sign = symbol in vowel_signs
            if position < last_position:
                upcoming = word[position + 1]
                # Inherent 'a' between this letter and the next one.
                if (not is_vowel_sign
                        and symbol not in vowels
                        and upcoming not in vowel_signs
                        and upcoming in dictionary):
                    pieces.append('a')
                # handle am sign
                if upcoming == anuswara and not is_vowel_sign:
                    pieces.append('a')
            elif not is_vowel_sign and symbol in dictionary:
                # Inherent 'a' after the final letter.
                pieces.append('a')
        return "".join(pieces)

    def transliterate(self, text, target_lang_code):
        """
        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.

        The transliteration function which can transliterate text to the
        supported target languages.

        The text is split into lines on newlines and into words on single
        spaces. The source language of each word is detected separately and
        the word is routed to the matching transliteration method. Words
        whose language cannot be detected are copied unchanged.
        """
        tx_str = ""
        lines = text.split("\n")
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    # Empty pieces come from consecutive spaces; other
                    # whitespace-only pieces (tabs) are kept as they are.
                    tx_str += word
                    continue

                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # Detection failed: keep the word as typed. Only
                    # ordinary errors are caught, so KeyboardInterrupt and
                    # SystemExit still propagate.
                    # FIXME: this branch puts the space before the word
                    # while every other branch puts it after.
                    tx_str += " " + word
                    continue

                if target_lang_code == "ISO15919":
                    tx_str += self.transliterate_iso15919(
                        word, src_lang_code) + " "
                elif target_lang_code == "IPA":
                    tx_str += self.transliterate_ipa(
                        word, src_lang_code) + " "
                elif src_lang_code == "en_US":
                    tx_str += self.transliterate_en_xx(
                        word, target_lang_code) + " "
                elif target_lang_code in ("en_US", "en_IN"):
                    tx_str += self.transliterate_xx_en(
                        word, src_lang_code) + " "
                else:
                    tx_str += self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code)
                    # NOTE(review): this looks at the length of the line,
                    # not the number of words; kept as it is so the output
                    # does not change.
                    if len(line) > 1:
                        tx_str += " "

            # Multi-line input gets a newline after every line, including
            # the last one.
            if len(lines) > 1:
                tx_str += "\n"

        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """
        Return the code point distance between two language bases.

        :param src: The source language code, looked up in ``lang_bases``.
        :type src: str.
        :param target: The target language code, looked up in ``lang_bases``.
        :type target: str.
        :returns: ``lang_bases[target] - lang_bases[src]``, or 0 when either
            language code has no usable entry.
        """
        try:
            return lang_bases[target] - lang_bases[src]
        except (KeyError, TypeError):
            # Unknown (or unhashable) language code: no shift is applied.
            # Other errors are no longer hidden the way a bare ``except``
            # would hide them.
            return 0

    def get_module_name(self):
        """
        Name under which this module identifies itself.

        :returns: the module name.
        """
        module_name = "Transliterator"
        return module_name

    def get_info(self):
        """
        Short human readable description of this module.

        :returns: the module description.
        """
        description = "Transliterate the text between any Indian Language"
        return description