Beispiel #1
0
    def check(self, word, language=None):
        """
        Check whether ``word`` is spelled correctly.

        :param word: the word to check; surrounding whitespace is ignored.
        :type word: str.
        :param language: *optional* language code for the word. When it is
            omitted the code is taken from ``_detect_lang(word)``.
        :type language: str.
        :returns: ``None`` for an empty word, ``True`` for numbers and for
            words found in the word list, ``False`` otherwise (including
            the case where no word list could be loaded).
        """
        word = word.strip()
        if word == "":
            return None
        # Numbers are never spell checked.
        if is_number(word):
            return True
        # The cached word list was loaded for the previous language; drop it
        # when this call names a different one.
        if self.lang != language:
            self.NWORDS = None
        if language is None:
            self.lang = _detect_lang(word)[word]
        else:
            self.lang = language

        if self.NWORDS is None:
            self.NWORDS = self.get_wordlist(word)
        if self.NWORDS is None:
            # Dictionary not found
            return False
        if word in self.NWORDS:
            return True
        # A word with letter case may be the capitalised first word of a
        # sentence, so retry with its first letter lower-cased.
        if word.upper() != word.lower():
            newword = word[0].lower() + word[1:]
            self.NWORDS = self.get_wordlist(newword)
            if self.NWORDS is None:
                # The reload can fail just like the first lookup; report
                # "not found" instead of raising on ``in None``.
                return False
            return newword in self.NWORDS
        return False
Beispiel #2
0
 def guessLanguage(self,text):
     """
     Return a language name for ``text``.

     ``guessLanguageName`` is tried first. When it answers ``'UNKNOWN'``
     the first whitespace-separated word is passed to ``_detect_lang`` and
     the part of the resulting code before the first underscore is handed
     to ``_getName``.

     :param text: the text to inspect.
     :returns: the language name, or ``'UNKNOWN'`` when the text contains
         no words to fall back on.
     """
     lang = guessLanguageName(text)
     if lang == 'UNKNOWN':
         words = text.split()
         if not words:
             # Empty or whitespace-only text: there is no first word, so
             # keep the 'UNKNOWN' answer instead of raising IndexError.
             return lang
         firstWord = words[0]
         lang = _detect_lang(firstWord)[firstWord]
         lang = _getName(lang.split("_")[0])
     return lang
Beispiel #3
0
    def check(self, word, language=None):
        """
        Tell whether ``word`` is present in the word list for its language.

        :param word: the word to check; it is stripped before use.
        :type word: str.
        :param language: *optional* language code. Without it the code is
            looked up through ``_detect_lang(word)``.
        :type language: str.
        :returns: ``None`` when the stripped word is empty, ``True`` when
            the word is a number or is found in the word list, ``False``
            when it is not found or no word list is available.
        """
        word = word.strip()
        if word == "":
            return None
        # If it is a number, don't do spellcheck.
        if is_number(word):
            return True
        # Forget the cached word list when the requested language differs
        # from the one used by the previous call.
        if self.lang != language:
            self.NWORDS = None
        self.lang = _detect_lang(word)[word] if language is None else language

        if self.NWORDS is None:
            self.NWORDS = self.get_wordlist(word)
        if self.NWORDS is None:
            # Dictionary not found
            return False
        if word in self.NWORDS:
            return True
        if word.upper() == word.lower():
            # The word has no letter case, so there is nothing to retry.
            return False
        # The word may be capitalised because it starts a sentence: look it
        # up again with the first letter converted to lower case.
        newword = word[0].lower() + word[1:]
        self.NWORDS = self.get_wordlist(newword)
        if self.NWORDS is None:
            # Reloading the word list failed; avoid ``in None`` raising
            # TypeError and report the word as not found.
            return False
        return newword in self.NWORDS
Beispiel #4
0
    def suggest(self, word, language=None, distance=2):
        """
        Gives a list of words similar to the given word

        :param word: The word for which spelling suggestions are required.
        :type word: str.
        :param language: *optional* language code for the word. When it is
            omitted the code is taken from ``_detect_lang(word)``.
        :type language: str.
        :param distance: suggestions are limited to words whose length
            differs from the word length by at most ``distance`` and whose
            ``levenshtein`` value against the word is at most ``distance``.
        :type distance: int
        :returns: ``None`` for an empty word, the word itself when it is
            already in the word list, otherwise a list of suggested
            spellings (empty when no word list is available).

         >>> a.suggest(u"cate")
         [u'cat', u'cater', u'caters', u'cats']

        """
        word = word.strip()
        if word == "":
            return None
        # The cached word list belongs to the previously used language.
        if self.lang != language:
            self.NWORDS = None
        if language is None:
            self.lang = _detect_lang(word)[word]
        else:
            self.lang = language
        if self.NWORDS is None:
            self.NWORDS = self.get_wordlist(word)
        if self.NWORDS is None:
            # Dictionary not found: there is nothing to suggest from, and
            # ``word in None`` below would raise TypeError.
            return []
        if word in self.NWORDS:
            return word
        # Skip words whose length is too far off before paying for the
        # edit-distance computation.
        candidates = [
            candidate for candidate in self.NWORDS
            if abs(len(candidate) - len(word)) <= distance
            and self.levenshtein(candidate, word) <= distance
        ]
        candidates = self.filter_candidates(word, candidates)
        if not candidates:
            # Try splitting the word in two to see if two words got merged.
            # Split points leave at least two characters on the left and
            # three on the right.
            for pos in range(2, len(word) - 2):
                left, right = word[:pos], word[pos:]
                if self.check(left, self.lang) \
                        and self.check(right, self.lang):
                    candidates.append(left + " " + right)
                    candidates.append(left + "-" + right)
        return candidates
Beispiel #5
0
    def transliterate(self, text, target_lang_code):
        """
        Transliterate ``text`` word by word into ``target_lang_code``.

        The text is split on newlines and every line on single spaces. For
        each non-blank word the source language code is looked up with
        ``_detect_lang`` and the word is handed to one of the
        ``transliterate_*`` methods, chosen in this order:

        * target ``"ISO15919"`` -> ``transliterate_iso15919``
        * target ``"IPA"`` -> ``transliterate_ipa``
        * source ``"en_US"`` -> ``transliterate_en_xx``
        * target ``"en_US"`` or ``"en_IN"`` -> ``transliterate_xx_en``
        * anything else -> ``transliterate_indic_indic``

        :param text: the text to transliterate.
        :param target_lang_code: the code of the target language or scheme.
        :returns: the transliterated text. When the input has more than one
            line, every line (including the last) is followed by a newline.
        """
        pieces = []
        lines = text.split("\n")
        multiline = len(lines) > 1
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    # Blank token (empty, or whitespace other than a plain
                    # space): copy it through unchanged.
                    pieces.append(word)
                    continue

                try:
                    src_lang_code = _detect_lang(word)[word]
                except Exception:
                    # Language lookup failed: keep the word as it is.
                    # ``Exception`` rather than a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    pieces.append(" " + word)
                    continue  # FIXME

                if target_lang_code == "ISO15919":
                    pieces.append(
                        self.transliterate_iso15919(word, src_lang_code) + " ")
                elif target_lang_code == "IPA":
                    pieces.append(
                        self.transliterate_ipa(word, src_lang_code) + " ")
                elif src_lang_code == "en_US":
                    pieces.append(
                        self.transliterate_en_xx(word, target_lang_code) + " ")
                elif target_lang_code in ("en_US", "en_IN"):
                    pieces.append(
                        self.transliterate_xx_en(word, src_lang_code) + " ")
                else:
                    pieces.append(self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code))
                    # On this path the separating space is only added for
                    # multi-line input (kept from the original behaviour).
                    if multiline:
                        pieces.append(" ")
            if multiline:
                pieces.append("\n")
        tx_str = "".join(pieces)
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str
Beispiel #6
0
 def suggest(self, word, language=None, distance=2):
     """
     Suggest spellings for ``word``.

     :param word: the word to find suggestions for; it is stripped first.
     :param language: optional language code; when omitted it is taken
         from ``_detect_lang(word)``.
     :param distance: maximum length difference and maximum
         ``levenshtein`` value allowed between the word and a suggestion.
     :returns: ``None`` for an empty word, the word itself when it is in
         the word list, otherwise a list of suggestions (empty when no
         word list is available).
     """
     word = word.strip()
     if word == "":
         return None
     # The cached word list belongs to the previously used language.
     if self.lang != language:
         self.NWORDS = None
     if language is None:
         self.lang = _detect_lang(word)[word]
     else:
         self.lang = language
     if self.NWORDS is None:
         self.NWORDS = self.get_wordlist(word)
     if self.NWORDS is None:
         # No word list could be loaded; return no suggestions rather
         # than failing on ``word in None``.
         return []
     if word in self.NWORDS:
         return word
     candidates = []
     for candidate in self.NWORDS:
         # If the length difference is greater than the threshold
         # distance, skip the candidate without computing the distance.
         if abs(len(candidate) - len(word)) > distance:
             continue
         if self.levenshtein(candidate, word) <= distance:
             candidates.append(candidate)
     candidates = self.filter_candidates(word, candidates)
     if not candidates:
         # Try inserting a space or hyphen between the letters to see if
         # two words got merged.
         for pos in range(2, len(word) - 2):
             head = word[:pos]
             tail = word[pos:]
             if self.check(head, self.lang) \
                     and self.check(tail, self.lang):
                 candidates.append(head + " " + tail)
                 candidates.append(head + "-" + tail)
     return candidates
Beispiel #7
0
 def suggest(self, word, language=None, distance=2):
     """
     Return spelling suggestions for ``word``.

     :param word: the word needing suggestions (whitespace is stripped).
     :param language: optional language code; detected through
         ``_detect_lang(word)`` when not given.
     :param distance: upper bound on both the length difference and the
         ``levenshtein`` value between the word and a suggestion.
     :returns: ``None`` if the word is empty, the word itself if the word
         list contains it, else a list of suggestions; the list is empty
         when no word list is available.
     """
     word = word.strip()
     if word == "":
         return None
     # Invalidate the cached word list when the language changes.
     if self.lang != language:
         self.NWORDS = None
     self.lang = _detect_lang(word)[word] if language is None else language
     if self.NWORDS is None:
         self.NWORDS = self.get_wordlist(word)
     if self.NWORDS is None:
         # Without a word list ``word in self.NWORDS`` would raise
         # TypeError; there is simply nothing to suggest.
         return []
     if word in self.NWORDS:
         return word

     word_len = len(word)
     # Words whose length is too far from the input are filtered out
     # before the edit distance is computed.
     candidates = [
         candidate for candidate in self.NWORDS
         if -distance <= len(candidate) - word_len <= distance
         and self.levenshtein(candidate, word) <= distance
     ]
     candidates = self.filter_candidates(word, candidates)
     if candidates:
         return candidates

     # Nothing similar found: check whether the input is two valid words
     # that got merged, and offer them joined by a space and by a hyphen.
     for pos in range(2, word_len - 2):
         first, second = word[:pos], word[pos:]
         if self.check(first, self.lang) and self.check(second, self.lang):
             candidates.append(first + " " + second)
             candidates.append(first + "-" + second)
     return candidates
Beispiel #8
0
    def check(self, word, language=None):
        """
        Checks whether given word has correct spelling.

        :param word: The word whose spelling is to be checked.
        :type word: str.
        :param language: *optional* language code for the word.
        :type language: str.
        :returns: ``True`` or ``False``; ``None`` when the word is empty
            after stripping whitespace.

         >>> a.check(u"അംഗദന്‍")
         True
        """
        word = word.strip()
        if not word:
            return None
        # If it is a number, don't do spellcheck.
        if is_number(word):
            return True
        # A word list cached for another language is of no use here.
        if self.lang != language:
            self.NWORDS = None
        if language is None:
            self.lang = _detect_lang(word)[word]
        else:
            self.lang = language

        if self.NWORDS is None:
            self.NWORDS = self.get_wordlist(word)
        if self.NWORDS is None:
            # Dictionary not found
            return False
        found = word in self.NWORDS
        if found or word.upper() == word.lower():
            # Either a hit, or a word without letter case for which the
            # lower-case retry below cannot make a difference.
            return found
        # Try converting the first letter to lower case. This covers a
        # word that is capitalised because it starts a sentence.
        newword = word[0].lower() + word[1:]
        self.NWORDS = self.get_wordlist(newword)
        if self.NWORDS is None:
            # The word list could not be reloaded; treat as not found
            # instead of raising TypeError on ``in None``.
            return False
        return newword in self.NWORDS
Beispiel #9
0
 def getScriptName(self,text):
     """Return the ``_detect_lang`` result for ``text`` serialised with ``dumps``."""
     detected = _detect_lang(text)
     return dumps(detected)
Beispiel #10
0
 def guessLanguageId(self,text):
     """
     Return a language identifier for ``text``.

     The module-level ``guessLanguage`` function is tried first. When it
     answers ``'UNKNOWN'`` the first whitespace-separated word is looked
     up with ``_detect_lang`` instead.

     :param text: the text to inspect.
     :returns: the language identifier, or ``'UNKNOWN'`` when the text
         contains no words to fall back on.
     """
     lang = guessLanguage(text)
     if lang == 'UNKNOWN':
         words = text.split()
         if not words:
             # Empty or whitespace-only text has no first word; keep
             # 'UNKNOWN' rather than raising IndexError.
             return lang
         firstWord = words[0]
         lang = _detect_lang(firstWord)[firstWord]
     return lang