def test_langdetect():
    '''TEST: Language Detection'''
    # Each entry pairs a sample word with the locale code that
    # detect_lang is expected to report for that exact word.
    expectations = (
        (u'ನಮಸ್ಕಾರ', 'kn_IN'),
        (u'बॆंगलूरु', 'hi_IN'),
        (u'বাংগ্লা', 'bn_IN'),
        (u'മലയാളം', 'ml_IN'),
        (u'தமிள்', 'ta_IN'),
        (u'తెలుగు', 'te_IN'),
        (u'଒ରିଯା', 'or_IN'),
        (u'ਪਂਜਾਬਿ', 'pa_IN'),
        (u'ગુજરાતિ', 'gu_IN'),
        ("English", 'en_US'),
    )
    for sample, expected_code in expectations:
        # detect_lang returns a mapping keyed by the word it was given.
        assert detect_lang(sample)[sample] == expected_code
Exemple #2
0
    def transliterate(self, text, target_lang_code):
        """
        Transliterate ``text`` word by word into the target language or scheme.

        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.

        The text is split on newlines and then on single spaces. The source
        language of every non-blank word is looked up with ``detect_lang``
        and the word is routed to the first matching helper:

        * target ``"ISO15919"`` -> ``transliterate_iso15919``
        * target ``"IPA"`` -> ``transliterate_ipa``
        * source ``"en_US"`` -> ``transliterate_en_xx``
        * target ``"en_US"`` or ``"en_IN"`` -> ``transliterate_xx_en``
        * anything else -> ``transliterate_indic_indic``

        A word whose language lookup raises is copied through unchanged.
        When the target is ``"ml_IN"`` the assembled text is finally passed
        through ``_malayalam_fixes``.
        """
        pieces = []
        lines = text.split("\n")
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    # Empty or whitespace-only token: copy it through as-is.
                    pieces.append(word)
                    continue

                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # Deliberately not a bare ``except:`` so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    # FIXME: unlike the other branches, the separating
                    # space is put in front of the word here.
                    pieces.append(" " + word)
                    continue

                if target_lang_code == "ISO15919":
                    pieces.append(
                        self.transliterate_iso15919(word, src_lang_code) + " ")
                elif target_lang_code == "IPA":
                    pieces.append(
                        self.transliterate_ipa(word, src_lang_code) + " ")
                elif src_lang_code == "en_US":
                    pieces.append(
                        self.transliterate_en_xx(word, target_lang_code) + " ")
                elif target_lang_code in ("en_US", "en_IN"):
                    pieces.append(
                        self.transliterate_xx_en(word, src_lang_code) + " ")
                else:
                    pieces.append(self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code))
                    # Note: the separator depends on the character length
                    # of the whole line, not on the number of words.
                    if len(line) > 1:
                        pieces.append(" ")

            # A newline is emitted after every line, but only for
            # multi-line input.
            if len(lines) > 1:
                pieces.append("\n")

        tx_str = "".join(pieces)
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str
Exemple #3
0
 def guessLanguage(self, text):
     """Return a language name for ``text``.

     ``guessLanguageName`` is tried first. When it answers ``'UNKNOWN'``,
     the first whitespace-delimited word of ``text`` is looked up with
     ``detect_lang`` and the part of the resulting code before the first
     ``_`` is handed to ``_getName``.

     Text that contains no words (empty or whitespace only) offers nothing
     to fall back on, so ``'UNKNOWN'`` is returned instead of raising
     ``IndexError``.
     """
     lang = guessLanguageName(text)
     if lang != 'UNKNOWN':
         return lang

     words = text.split()
     if not words:
         # Nothing to inspect; keep the 'UNKNOWN' verdict.
         return lang

     firstWord = words[0]
     lang_code = detect_lang(firstWord)[firstWord]
     return _getName(lang_code.split("_")[0])
Exemple #4
0
 def guessLanguage(self, text):
     """Guess the language name of ``text``.

     The result of ``guessLanguageName(text)`` is returned unless it is
     ``'UNKNOWN'``. In that case the first whitespace-separated word is
     passed to ``detect_lang``; the detected code is cut at its first
     ``_`` and the leading part is converted with ``_getName``.

     If ``text`` has no words at all (empty or only whitespace), the
     ``'UNKNOWN'`` result is returned as-is rather than failing with
     ``IndexError`` on the missing first word.
     """
     lang = guessLanguageName(text)
     if lang == 'UNKNOWN':
         tokens = text.split()
         if tokens:
             firstWord = tokens[0]
             detected = detect_lang(firstWord)[firstWord]
             lang = _getName(detected.split("_")[0])
     return lang
Exemple #5
0
    def syllabify(self, text):
        """
        Break the given text into syllables.

        The language is detected from the first space-delimited token of
        ``text`` and the whole text is handed to the matching
        ``syllabify_<lang>`` method.

        :param text: the input text.
        :type text: str.
        :returns: an empty list for blank text; otherwise the result of the
            language-specific syllabifier, or a list of the individual
            characters of ``text`` when no syllabifier matches.
        """
        if not text.strip():
            return []

        first_token = text.split(" ")[0]
        lang = detect_lang(first_token)[first_token]

        # Detected language code -> name of the method that handles it.
        handlers = (
            ("ml_IN", "syllabify_ml"),
            ("hi_IN", "syllabify_hi"),
            ("kn_IN", "syllabify_kn"),
            ("bn_IN", "syllabify_bn"),
            ("ta_IN", "syllabify_ta"),
            ("en_US", "syllabify_en"),
        )
        for code, method_name in handlers:
            if lang == code:
                return getattr(self, method_name)(text)

        # No dedicated syllabifier: fall back to one entry per character.
        return list(text)
Exemple #6
0
    def syllabify(self, text):
        """
        Syllabify ``text`` using the syllabifier of its detected language.

        :param text: the input text.
        :type text: str.
        :returns: ``[]`` when the text is blank; the output of the matching
            ``syllabify_*`` method when the language of the first
            space-separated token is one of ml_IN, hi_IN, kn_IN, bn_IN,
            ta_IN or en_US; otherwise the characters of ``text`` as a list.
        """
        if text.strip() == "":
            return []

        # Only the leading token decides which syllabifier is used.
        head = text.split(" ")[0]
        lang = detect_lang(head)[head]

        if lang == "ml_IN":
            handler = self.syllabify_ml
        elif lang == "hi_IN":
            handler = self.syllabify_hi
        elif lang == "kn_IN":
            handler = self.syllabify_kn
        elif lang == "bn_IN":
            handler = self.syllabify_bn
        elif lang == "ta_IN":
            handler = self.syllabify_ta
        elif lang == "en_US":
            handler = self.syllabify_en
        else:
            # Unsupported language: every character is its own unit.
            return [char for char in text]

        return handler(text)
Exemple #7
0
    def transliterate(self, text, target_lang_code):
        """
        Transliterate ``text`` word by word into the target language or scheme.

        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.

        The transliteration function which can transliterate text to the
        supported target languages. Lines are split on ``"\\n"`` and words
        on single spaces; each non-blank word has its source language looked
        up with ``detect_lang`` and is dispatched, in this order of
        precedence, on: target ``"ISO15919"``, target ``"IPA"``, source
        ``"en_US"``, target ``"en_US"``/``"en_IN"``, and finally the generic
        ``transliterate_indic_indic``. Words whose lookup raises are copied
        through unchanged. ``_malayalam_fixes`` is applied to the final text
        when the target is ``"ml_IN"``.
        """
        chunks = []
        lines = text.split("\n")
        multi_line = len(lines) > 1

        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    # Empty or whitespace-only token: passed through verbatim.
                    chunks.append(word)
                    continue

                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # ``except Exception`` rather than a bare ``except:`` so
                    # KeyboardInterrupt / SystemExit are not swallowed.
                    # FIXME: the space is prepended here but appended in
                    # every other branch.
                    chunks.append(" " + word)
                    continue

                if target_lang_code == "ISO15919":
                    chunks.append(self.transliterate_iso15919(word, src_lang_code) + " ")
                elif target_lang_code == "IPA":
                    chunks.append(self.transliterate_ipa(word, src_lang_code) + " ")
                elif src_lang_code == "en_US":
                    chunks.append(self.transliterate_en_xx(word, target_lang_code) + " ")
                elif target_lang_code in ("en_US", "en_IN"):
                    chunks.append(self.transliterate_xx_en(word, src_lang_code) + " ")
                else:
                    chunks.append(
                        self.transliterate_indic_indic(word, src_lang_code, target_lang_code))
                    # The separator is keyed on the line's character count.
                    if len(line) > 1:
                        chunks.append(" ")

            # Every line of a multi-line input is followed by a newline.
            if multi_line:
                chunks.append("\n")

        tx_str = "".join(chunks)
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str
Exemple #8
0
    def transliterate(self, text, target_lang_code):
        """
        Transliterate ``text`` word by word into ``target_lang_code``.

        :param text: The text to be transliterated.
        :param target_lang_code: Target language code, or one of the scheme
            names ``"ISO15919"`` / ``"IPA"``.
        :returns: the transliterated text.

        ``text`` is split on newlines and then on single spaces. For each
        non-blank word the source language comes from ``detect_lang``; the
        word is then sent to ``transliterate_iso15919``,
        ``transliterate_ipa``, ``transliterate_en_xx``,
        ``transliterate_xx_en`` or ``transliterate_indic_indic`` (checked in
        that order). A word whose lookup raises is copied through unchanged.
        For an ``"ml_IN"`` target the result is post-processed by
        ``_malayalam_fixes``.
        """
        out = []
        lines = text.split("\n")
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    # Empty or whitespace-only token: nothing to convert.
                    out.append(word)
                    continue

                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # Narrowed from a bare ``except:`` so that
                    # KeyboardInterrupt / SystemExit are not caught.
                    # FIXME: separator placement differs from the other
                    # branches (leading instead of trailing space).
                    out.append(" " + word)
                    continue

                if target_lang_code == "ISO15919":
                    converted = self.transliterate_iso15919(word, src_lang_code) + " "
                elif target_lang_code == "IPA":
                    converted = self.transliterate_ipa(word, src_lang_code) + " "
                elif src_lang_code == "en_US":
                    converted = self.transliterate_en_xx(word, target_lang_code) + " "
                elif target_lang_code in ("en_US", "en_IN"):
                    converted = self.transliterate_xx_en(word, src_lang_code) + " "
                else:
                    converted = self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code)
                    # Trailing space depends on the length of the line text.
                    if len(line) > 1:
                        converted += " "
                out.append(converted)

            # Newlines are only re-inserted for multi-line input.
            if len(lines) > 1:
                out.append("\n")

        tx_str = "".join(out)
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str
Exemple #9
0
 def getScriptName(self, text):
     """Run ``detect_lang`` on ``text`` and return its result passed through ``dumps``."""
     detected = detect_lang(text)
     return dumps(detected)
Exemple #10
0
 def guessLanguageId(self, text):
     """Return a language identifier for ``text``.

     ``guessLanguage`` is consulted first. If it answers ``'UNKNOWN'``, the
     first whitespace-delimited word of ``text`` is looked up with
     ``detect_lang`` and that word's entry is returned instead.

     Text without any words (empty or whitespace only) cannot be looked
     up, so ``'UNKNOWN'`` is returned instead of raising ``IndexError``.
     """
     lang = guessLanguage(text)
     if lang != 'UNKNOWN':
         return lang

     words = text.split()
     if not words:
         # No first word to fall back on; keep the 'UNKNOWN' verdict.
         return lang

     firstWord = words[0]
     return detect_lang(firstWord)[firstWord]
Exemple #11
0
 def getScriptName(self, text):
     """Return whatever ``detect_lang`` reports for ``text``, unchanged."""
     detection = detect_lang(text)
     return detection
Exemple #12
0
 def guessLanguageId(self, text):
     """Guess the language identifier of ``text``.

     The value from ``guessLanguage(text)`` is returned unless it is
     ``'UNKNOWN'``; in that case the first whitespace-separated word is
     given to ``detect_lang`` and the entry for that word is returned.

     When ``text`` holds no words (empty or only whitespace) the
     ``'UNKNOWN'`` value is returned as-is rather than failing with
     ``IndexError`` on the missing first word.
     """
     lang = guessLanguage(text)
     if lang == 'UNKNOWN':
         tokens = text.split()
         if tokens:
             firstWord = tokens[0]
             lang = detect_lang(firstWord)[firstWord]
     return lang