def check(self, word, language=None):
    """
    Check whether the given word is spelled correctly.

    :param word: the word to check; surrounding whitespace is ignored.
    :type word: str.
    :param language: *optional* language code for the word. When omitted,
        the code is taken from ``_detect_lang(word)``.
    :type language: str.
    :returns: None for an empty word; True when ``is_number`` accepts the
        word or the word is in the word list; False otherwise, including
        when no word list could be loaded.
    """
    word = word.strip()
    if word == "":
        return None

    # Numbers are not spell-checked.
    if is_number(word):
        return True

    # Drop the cached word list when the requested language differs from
    # the one stored by the previous call.
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = _detect_lang(word)[word]
    else:
        self.lang = language

    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found
        return False

    if word in self.NWORDS:
        return True

    # Scripts without letter case have nothing further to try.
    if word.upper() == word.lower():
        return False

    # The word may be capitalised only because it starts a sentence, so
    # retry with the first letter in lower case.
    newword = word[0].lower() + word[1:]
    self.NWORDS = self.get_wordlist(newword)
    if self.NWORDS is None:
        # No word list for the lower-cased form; report a misspelling
        # instead of failing on "in None".
        return False
    return newword in self.NWORDS
def guessLanguage(self, text):
    """
    Return the name of the language the text appears to be written in.

    ``guessLanguageName`` is tried first. When it answers 'UNKNOWN', the
    first whitespace-separated word is passed to ``_detect_lang`` and the
    part of the detected code before the underscore is turned into a name
    with ``_getName``.

    :param text: the text to inspect.
    :type text: str.
    :returns: the language name, or 'UNKNOWN' when the guess is unknown
        and the text contains no words to fall back on.
    """
    lang = guessLanguageName(text)
    if lang == 'UNKNOWN':
        words = text.split()
        if not words:
            # Empty or whitespace-only text: there is no first word to
            # inspect, so keep the 'UNKNOWN' answer.
            return lang
        first_word = words[0]
        lang_code = _detect_lang(first_word)[first_word]
        lang = _getName(lang_code.split("_")[0])
    return lang
def check(self, word, language=None):
    """
    Tell whether a word is present in the dictionary for its language.

    :param word: the word to check; leading and trailing whitespace is
        stripped first.
    :type word: str.
    :param language: *optional* language code. When None, the code comes
        from ``_detect_lang(word)``.
    :type language: str.
    :returns: None when the word is empty after stripping; True for words
        accepted by ``is_number`` and for words found in the word list;
        False when the word is not found or no word list is available.
    """
    word = word.strip()
    if word == "":
        return None

    # If it is a number, don't do a spell check.
    if is_number(word):
        return True

    # A language different from the stored one invalidates the cached
    # word list.
    if self.lang != language:
        self.NWORDS = None
    self.lang = _detect_lang(word)[word] if language is None else language

    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found
        return False

    found = word in self.NWORDS
    has_letter_case = word.upper() != word.lower()
    if found or not has_letter_case:
        return found

    # A cased word that was not found may simply be the first word of a
    # sentence: look it up again with a lower-case initial letter.
    newword = word[0].lower() + word[1:]
    self.NWORDS = self.get_wordlist(newword)
    if self.NWORDS is None:
        # The reload found no dictionary; avoid a membership test on None.
        return False
    return newword in self.NWORDS
def suggest(self, word, language=None, distance=2):
    """
    Gives a list of words similar to the given word

    :param word: The word for which spelling suggestions are required.
    :type word: str.
    :param language: *optional* language code for the word. When omitted,
        the code is taken from ``_detect_lang(word)``.
    :type language: str.
    :param distance: suggestions are limited to words whose length is
        within +/- distance of the word length and whose
        ``self.levenshtein`` distance to the word is at most distance.
    :type distance: int
    :returns: A list of suggested spellings. Two special cases: None is
        returned for an empty word, and the word itself (not a list) is
        returned when it is already in the word list. An empty list is
        returned when no word list could be loaded.

    >>> a.suggest(u"cate")
    [u'cat', u'cater', u'caters', u'cats']
    """
    word = word.strip()
    if word == "":
        return None

    # Drop the cached word list when the requested language differs from
    # the one stored by the previous call.
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = _detect_lang(word)[word]
    else:
        self.lang = language

    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found: there is nothing to suggest from.
        return []

    if word in self.NWORDS:
        return word

    candidates = []
    for candidate in self.NWORDS:
        # Skip words whose length differs by more than the threshold
        # before paying for the edit-distance computation.
        if abs(len(candidate) - len(word)) > distance:
            continue
        if self.levenshtein(candidate, word) <= distance:
            candidates.append(candidate)
    candidates = self.filter_candidates(word, candidates)

    if not candidates:
        # Try splitting the word at each inner position to see whether two
        # valid words got merged; offer both a spaced and a hyphenated
        # form for every split that works.
        for pos in range(2, len(word) - 2):
            head, tail = word[:pos], word[pos:]
            if self.check(head, self.lang) and self.check(tail, self.lang):
                candidates.append(head + " " + tail)
                candidates.append(head + "-" + tail)
    return candidates
def transliterate(self, text, target_lang_code):
    """
    Transliterate text word by word into the target language or scheme.

    The text is split on newlines and each line on single spaces. The
    source language of every non-blank word is taken from
    ``_detect_lang`` and the word is handed to the matching
    ``transliterate_*`` method.

    :param text: the text to transliterate; may contain several lines.
    :type text: str.
    :param target_lang_code: target code. "ISO15919" and "IPA" select
        those schemes, "en_US" / "en_IN" select transliteration to
        English, anything else is passed on as the target language.
    :type target_lang_code: str.
    :returns: the transliterated text. For multi-line input a newline is
        appended after every line, including the last one.
    """
    pieces = []
    lines = text.split("\n")
    multiline = len(lines) > 1

    for line in lines:
        for word in line.split(" "):
            if not word.strip():
                # Blank fragments are copied through unchanged.
                pieces.append(word)
                continue

            try:
                src_lang_code = _detect_lang(word)[word]
            except Exception:
                # FIXME: words whose language cannot be determined are
                # copied as they are, with the space placed in front
                # rather than behind.
                pieces.append(" " + word)
                continue

            if target_lang_code == "ISO15919":
                pieces.append(
                    self.transliterate_iso15919(word, src_lang_code) + " ")
            elif target_lang_code == "IPA":
                pieces.append(
                    self.transliterate_ipa(word, src_lang_code) + " ")
            elif src_lang_code == "en_US":
                pieces.append(
                    self.transliterate_en_xx(word, target_lang_code) + " ")
            elif target_lang_code in ("en_US", "en_IN"):
                pieces.append(
                    self.transliterate_xx_en(word, src_lang_code) + " ")
            else:
                pieces.append(self.transliterate_indic_indic(
                    word, src_lang_code, target_lang_code))
                # This branch only adds the separating space for
                # multi-line input.
                if multiline:
                    pieces.append(" ")

        if multiline:
            pieces.append("\n")

    tx_str = "".join(pieces)

    # Language specific fixes
    if target_lang_code == "ml_IN":
        tx_str = self._malayalam_fixes(tx_str)
    return tx_str
def suggest(self, word, language=None, distance=2):
    """
    Collect dictionary words that are close to the given word.

    :param word: the word that needs spelling suggestions; surrounding
        whitespace is ignored.
    :type word: str.
    :param language: *optional* language code. When None, the code comes
        from ``_detect_lang(word)``.
    :type language: str.
    :param distance: maximum length difference and maximum
        ``self.levenshtein`` distance allowed for a suggestion.
    :type distance: int
    :returns: a list of suggestions, which is empty when no word list is
        available. None is returned for an empty word, and the word
        itself (not a list) when it is already in the word list.
    """
    word = word.strip()
    if word == "":
        return None

    # A language different from the stored one invalidates the cached
    # word list.
    if self.lang != language:
        self.NWORDS = None
    self.lang = _detect_lang(word)[word] if language is None else language

    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found, so no suggestion can be made.
        return []

    if word in self.NWORDS:
        return word

    word_length = len(word)
    candidates = []
    for candidate in self.NWORDS:
        # If the length difference is greater than the threshold
        # distance, skip the word without computing the edit distance.
        if abs(len(candidate) - word_length) > distance:
            continue
        if self.levenshtein(candidate, word) <= distance:
            candidates.append(candidate)
    candidates = self.filter_candidates(word, candidates)

    if not candidates:
        # Try inserting a space (or hyphen) between the letters to see
        # whether the word is really two words that got merged.
        for pos in range(2, word_length - 2):
            left, right = word[:pos], word[pos:]
            if self.check(left, self.lang) and self.check(right, self.lang):
                candidates.append(left + " " + right)
                candidates.append(left + "-" + right)
    return candidates
def suggest(self, word, language=None, distance=2):
    """
    Propose spellings for a word that is not in the dictionary.

    :param word: the word to find suggestions for; it is stripped of
        surrounding whitespace first.
    :type word: str.
    :param language: *optional* language code; ``_detect_lang(word)``
        supplies it when None.
    :type language: str.
    :param distance: upper bound on both the length difference and the
        ``self.levenshtein`` distance between the word and a suggestion.
    :type distance: int
    :returns: None for an empty word; the word itself when it is in the
        word list; otherwise a list of suggestions (empty when no word
        list could be loaded).
    """
    word = word.strip()
    if word == "":
        return None

    # Forget the cached word list if it was loaded for another language.
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = _detect_lang(word)[word]
    else:
        self.lang = language

    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found: return no suggestions rather than failing
        # on a membership test against None.
        return []

    if word in self.NWORDS:
        return word

    def within_reach(candidate):
        # The cheap length comparison runs first so that the edit distance
        # is only computed for plausible words.
        if abs(len(candidate) - len(word)) > distance:
            return False
        return self.levenshtein(candidate, word) <= distance

    candidates = [entry for entry in self.NWORDS if within_reach(entry)]
    candidates = self.filter_candidates(word, candidates)

    if not candidates:
        # Nothing similar was found; see whether the word is two valid
        # words that got merged and offer them separated by a space and
        # by a hyphen.
        for pos in range(2, len(word) - 2):
            first, second = word[:pos], word[pos:]
            if self.check(first, self.lang) and self.check(second, self.lang):
                candidates.append(first + " " + second)
                candidates.append(first + "-" + second)
    return candidates
def check(self, word, language=None):
    """
    Checks whether given word has correct spelling.

    :param word: The word whose spelling is to be checked.
    :type word: str.
    :param language: *optional* language code for the word. When omitted,
        the code is taken from ``_detect_lang(word)``.
    :type language: str.
    :returns: Boolean True or False. None is returned for an empty word,
        and False when no dictionary could be loaded.

    >>> a.check(u"അംഗദന്")
    True
    """
    word = word.strip()
    if word == "":
        return None

    # If it is a number, don't do a spell check.
    if is_number(word):
        return True

    # The cached word list is only valid for the language it was loaded
    # for.
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = _detect_lang(word)[word]
    else:
        self.lang = language

    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found
        return False

    if word in self.NWORDS:
        return True

    if word.upper() != word.lower():
        # The word has letter case, so it may be the capitalised first
        # word of a sentence: try again with a lower-case first letter.
        newword = word[0].lower() + word[1:]
        self.NWORDS = self.get_wordlist(newword)
        # A missing dictionary for the lower-cased form counts as
        # "not found" instead of raising on "in None".
        return self.NWORDS is not None and newword in self.NWORDS
    return False
def getScriptName(self, text):
    """
    Return the serialised detection result for the given text.

    The text is passed to ``_detect_lang`` and its result is returned
    after being run through ``dumps``.

    :param text: the text to inspect.
    :returns: whatever ``dumps`` produces for the detection result
        (presumably a JSON string -- confirm which ``dumps`` is imported).
    """
    detected = _detect_lang(text)
    return dumps(detected)
def guessLanguageId(self, text):
    """
    Return the identifier of the language the text appears to be in.

    The module-level ``guessLanguage`` function is tried first. When it
    answers 'UNKNOWN', the first whitespace-separated word is passed to
    ``_detect_lang`` and the code detected for that word is returned.

    :param text: the text to inspect.
    :type text: str.
    :returns: the language identifier, or 'UNKNOWN' when the guess is
        unknown and the text contains no words to fall back on.
    """
    lang = guessLanguage(text)
    if lang != 'UNKNOWN':
        return lang

    words = text.split()
    if not words:
        # Empty or whitespace-only text has no first word to inspect, so
        # keep the 'UNKNOWN' answer.
        return lang
    first_word = words[0]
    return _detect_lang(first_word)[first_word]