def suggestion_policy(self, word, suggs):
    """Narrow down and rank candidate suggestions for ``word``.

    Candidates are kept when their length, counted in Tamil letters
    (``tamil.utf8.get_letters``), lies between ``max(n - 2, 1)`` and
    ``n + 1`` where ``n`` is the letter count of ``word``.  Survivors are
    de-duplicated and ordered by ascending edit distance to ``word``.

    If no candidate passes the length filter, the unfiltered ``suggs`` are
    sorted with ``tamil.utf8.tamil_sorted`` and at most the first ten are
    returned as a best guess.

    Args:
        word: the word being checked.
        suggs: iterable of candidate replacement words.

    Returns:
        list of suggested words.
    """

    def tamil_length(w):
        return len(tamil.utf8.get_letters(w))

    ref_wl = tamil_length(word)
    min_len, max_len = max(ref_wl - 2, 1), ref_wl + 1

    # Both bounds are measured in Tamil letters.  The upper bound used to
    # compare the raw code-point count (len(w)) against a letter count.
    filter_suggs = {w for w in suggs if min_len <= tamil_length(w) <= max_len}

    if not filter_suggs:
        # guess!  Keep at most ten entries.  The previous slice
        # [min(10, len - 1):] also dropped the last entry of any list with
        # ten or fewer items (a single candidate became an empty list).
        return list(tamil.utf8.tamil_sorted(suggs))[:10]

    def _compare_fn(wA, wB):
        # A cmp-style function must return a negative, zero or positive
        # number.  The previous boolean result was never negative, so the
        # key objects built by functools.cmp_to_key never compared "less".
        dist_a = edit_distance(wA, word)
        dist_b = edit_distance(wB, word)
        return (dist_a > dist_b) - (dist_a < dist_b)

    return list(
        tamil.utf8.tamil_sorted(filter_suggs,
                                key=functools.cmp_to_key(_compare_fn)))
def get_min_distance_alternate( pizhai ):
    """Return the dictionary words closest to ``pizhai`` by edit distance.

    The edit distance from ``pizhai`` to each word of a small built-in word
    list is computed and printed; every word that attains the minimum
    distance is returned, in word-list order.

    Args:
        pizhai: the (possibly misspelt) word to look up.

    Returns:
        list of the word-list entries at minimum edit distance.
    """
    agarathi_sorkal = [u'அவிழ்',u'அவல்',u'அவள்',u'தவில்',u'தவள்']
    # Build a real list: on Python 3 map() yields a one-shot iterator, which
    # min() would exhaust and which has no .index() method.
    distances = [edit_distance(pizhai, w) for w in agarathi_sorkal]
    print(distances)
    m = min(distances)
    return [w for w, d in zip(agarathi_sorkal, distances) if d == m]
def get_min_distance_alternate(pizhai):
    """Find the built-in words with the smallest edit distance to ``pizhai``.

    Prints the list of distances (one per built-in word) and returns every
    word whose distance equals the minimum, preserving word-list order.
    """
    agarathi_sorkal = ["அவிழ்", "அவல்", "அவள்", "தவில்", "தவள்"]

    distances = []
    for sol in agarathi_sorkal:
        distances.append(edit_distance(pizhai, sol))
    print(distances)

    closest = min(distances)
    return [
        agarathi_sorkal[pos]
        for pos, dist in enumerate(distances)
        if dist == closest
    ]
def get_min_distance_alternate(self, pizhai):
    """Return the built-in words nearest to ``pizhai`` by edit distance.

    The distances to all built-in words are printed, then every word tied
    for the minimum distance is returned in word-list order.
    """
    from ngram.Distance import edit_distance

    agarathi_sorkal = [u"அவிழ்", u"அவல்", u"அவள்", u"தவில்", u"தவள்"]
    distances = [edit_distance(pizhai, sol) for sol in agarathi_sorkal]
    print(distances)

    smallest = min(distances)
    matches = []
    for sol, dist in zip(agarathi_sorkal, distances):
        if dist == smallest:
            matches.append(sol)
    return matches
def check_word_and_suggest(self, word, errmsg=None):
    """Spell-check a single word and propose replacements.

    The checks below run in order; the first one that applies decides the
    result:

    1. Surrounding whitespace and trailing punctuation are stripped.
    2. Tamil mode only: a numeric word yields ``(False, [numeral words])``.
    3. A word starting with digits and ending in a date suffix is accepted.
    4. A word containing ASCII letters yields ``(False, [''])`` when it is
       in the English dictionary, otherwise its transliteration.
    5. A hyphenated word yields the word with hyphens replaced by spaces.
    6. A word not in the desired language, or empty once digits are
       removed, yields ``(False, [u""])``.
    7. A dictionary hit (as typed, or after the case filter) is accepted.
    8. If only the deletion filter produced candidates, they are returned.
    9. Tamil mode only: a successful ottru split accepts the word.
    10. Otherwise candidates from all generators are dictionary-filtered,
        sorted, de-duplicated, scored and returned best-first.

    Args:
        word: the word to check.
        errmsg: optional list handed to ``Typographical.checkFormErrors``;
            ``"TypographicalError"`` is appended to it when that check
            reports an error and the list is already non-empty.

    Returns:
        A tuple ``(ok, suggestions)``.  ``suggestions`` is a list, except
        on dictionary / ottru-split hits (steps 7 and 9) where the bare
        word string is returned.
    """
    word = word.strip()
    # skip known punctuation at end of line
    while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
        word = word[:-1]
    while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
        word = word[1:]

    # is number then we propose a numeral
    if self.in_tamil_mode():
        numword = word.replace(u',', u'')
        if re.match(r'[+|-]*[\d]+', numword):
            try:
                num = float(numword)
                posnum = num
                if num < 0:
                    posnum = -1 * num
                numeral_form = tamil.numeral.num2tamilstr(posnum)
                if num < 0:
                    numeral_form = u"கழித்தல் " + numeral_form
                return (False, [numeral_form])
            except Exception:
                # Best effort: the regex only anchors at the start, so the
                # conversion can fail; fall through to the other checks.
                pass

    # dates are okay
    if any(map(word.endswith, [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"])):
        if re.search(r'^\d+', word):
            return (True, [word])  # word is okay

    # check if words are transliterated
    if any(
            filter(lambda x: x in string.ascii_letters,
                   tamil.utf8.get_letters(word))):
        # letter-sequence only
        en_word = Speller.scrub_ws(word)
        EN_Lexicon = Speller.get_english_dictionary()
        if EN_Lexicon.isWord(en_word):
            # English word - nosub- yet until we have parallel dictionaries
            # or translation. TBD.
            return (False, [''])
        # is english letter
        ta = algorithm.Iterative.transliterate(jaffna.Transliteration.table,
                                               en_word)
        # TBD: potential for having ANN to tell if english text is pure
        # English word or a romanized Tamil word. Output of classifier can
        # be useful here.
        return (False, [ta])

    # check if it matches Tamil numeral and has close match.
    # propose suggestions from that list.
    # TBD

    # hyphens are not okay
    if word.find(u"-") >= 0:
        return (False, [word.replace(u"-", u" ")])

    orig_word = u"%s" % word

    # remove digits
    word = re.sub(r'\d+', u'', word)
    # NOTE: `letters` is taken before the case filter below and is reused
    # afterwards alongside the case-filtered word.
    letters = tamil.utf8.get_letters(word)

    TVU_dict = self.get_lang_dictionary()
    self.add_numeral_words(TVU_dict)

    # Check if this 'word' is any common kind of error
    if Typographical.checkFormErrors(word, errmsg):
        if errmsg:
            errmsg.append("TypographicalError")

    if not self.checklang(word):
        print("Word is not in desired language!")
        return (False, [u""])

    if len(word) < 1:
        print("Word is too small")
        return (False, [u''])

    # plain old dictionary + user dictionary check
    if self.isWord(word):
        return (True, word)

    # Remove case and redo the dictionary + user check
    word_nocase = self.case_filter.apply(word)
    if self.isWord(word_nocase):
        return (True, word_nocase)
    word = word_nocase

    # Consider splitting the word and see if it has 2 sub-words
    # e.g. செயல்பட => செயல் + பட
    alt = tamil.wordutils.greedy_split(word, TVU_dict)
    greedy_results = list()
    if len(alt) >= 1:
        greedy_results = [u" ".join(alt), u"-".join(alt)]
        greedy_results.extend(alt)

    # if there are no other suggestions than deletion filter, we return
    # in presence of other suggestions we can just return suggestions
    suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
    if len(suggs) > 0:
        if len(greedy_results) == 0:
            return (False, suggs)
        else:
            greedy_results.extend(suggs)

    # ottru splitting for Tamil language mode
    ottru_options = []
    if self.in_tamil_mode():
        # discover words like யாரிகழ்ந்து are accepted.
        ottru = OttruSplit(word, letters)
        ottru.run(TVU_dict)
        if len(ottru.results) > 0:
            return (True, word)
        ottru_options = ottru.results

    # TODO: Noun Declension - ticket-

    # suggestions at edit distance 1
    norvig_suggests = filter(
        TVU_dict.isWord, norvig_suggestor(word, self.alphabets, 2, limit=25))
    combinagram_suggests = list(
        tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
    pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

    # FIXME: score the options
    options = greedy_results
    options.extend(ottru_options)
    options.extend(list(norvig_suggests))
    options.extend(combinagram_suggests)
    options.extend(pfx_options)

    # filter the options against a dictionary!
    options = filter(TVU_dict.isWord, options)
    if PYTHON3:
        options = list(options)

    if self.in_tamil_mode():
        options.extend(self.mayangoli_suggestions(orig_word, letters))

    # sort the options (the de-duplication below relies on equal entries
    # being adjacent)
    if not self.in_tamil_mode():
        options.sort()
    else:
        if PYTHON3:
            options = sorted(options,
                             key=functools.cmp_to_key(
                                 tamil.utf8.compare_words_lexicographic))
        else:
            options = sorted(options,
                             cmp=tamil.utf8.compare_words_lexicographic)

    # remove replacements with single-letter words
    WL = len(tamil.utf8.get_letters(word))
    if WL > 3:
        options = [
            x for x in options if len(tamil.utf8.get_letters(x)) > 2
        ]

    # remove dupes in list
    options2 = []
    prev = None
    for val in options:
        stripped = val.strip()
        if stripped != prev:
            options2.append(stripped)
            prev = stripped

    if _DEBUG:
        print("@deduplication")
        pprint.pprint(options2)

    # score by Dice or Edit-Distance coefficients
    # (dice coeff is weighted down)
    options_score = [
        (len(word) - edit_distance(word, sugg_word)) /
        (1.0 * len(orig_word)) * Dice_coeff(word, sugg_word) / 3.0
        for sugg_word in options2
    ]
    options = zip(options2, options_score)

    # limit options by score
    options = sorted(options, key=operator.itemgetter(1), reverse=True)
    options = [word_pair[0] for word_pair in options]
    # L = 40
    # limit to first top -L=20 only which is good enough
    # options = options[0:min(len(options),L)]
    if _DEBUG:
        pprint.pprint("@after scoring/sorting")
        pprint.pprint(options)

    # eliminate single letter options; build a real list so that Python 3
    # callers do not receive a lazy, one-shot filter object
    options = [x for x in options if x not in tamil.utf8.tamil_letters]

    # Due to suggestion policy we may have words which are found in error
    # but we dont have replacements for them!
    # TBD: options should not have the 'word'!
    return (False, options)
def check_word_and_suggest( self,word, errmsg = None ):
    """Spell-check a single word and propose replacements.

    The checks below run in order; the first one that applies decides the
    result:

    1. Surrounding whitespace and trailing punctuation are stripped.
    2. Tamil mode only: a numeric word yields ``(False, [numeral words])``.
    3. A word starting with digits and ending in a date suffix is accepted.
    4. A word containing ASCII letters yields ``(False, [''])`` when it is
       in the English dictionary, otherwise its transliteration.
    5. A hyphenated word yields the word with hyphens replaced by spaces.
    6. A word not in the desired language, or empty once digits are
       removed, yields ``(False, [u""])``.
    7. A dictionary hit (as typed, or after the case filter) is accepted.
    8. If only the deletion filter produced candidates, they are returned.
    9. Tamil mode only: a successful ottru split accepts the word.
    10. Otherwise candidates from all generators are dictionary-filtered,
        sorted, de-duplicated, scored and returned best-first.

    Args:
        word: the word to check.
        errmsg: optional list handed to ``Typographical.checkFormErrors``;
            ``"TypographicalError"`` is appended to it when that check
            reports an error and the list is already non-empty.

    Returns:
        A tuple ``(ok, suggestions)``.  ``suggestions`` is a list, except
        on dictionary / ottru-split hits (steps 7 and 9) where the bare
        word string is returned.
    """
    word = word.strip()
    # skip known punctuation at end of line
    while len(word) >= 1 and any(map(word.endswith,Speller.punctuation)):
        word = word[:-1]
    while len(word) >= 1 and any(map(word.startswith,string.whitespace)):
        word = word[1:]

    # is number then we propose a numeral
    if self.in_tamil_mode():
        numword = word.replace(u',',u'')
        if re.match(r'[+|-]*[\d]+',numword):
            try:
                num = float(numword)
                posnum = num
                if num < 0:
                    posnum = -1*num
                numeral_form = tamil.numeral.num2tamilstr(posnum)
                if num < 0:
                    numeral_form = u"கழித்தல் "+numeral_form
                return (False,[numeral_form])
            except Exception:
                # Best effort: the regex only anchors at the start, so the
                # conversion can fail; fall through to the other checks.
                pass

    # dates are okay
    if any(map(word.endswith,[u"-இல்",u"-ஆம்",u"-இலிருந்து", u"-வரை"])):
        if re.search(r'^\d+',word):
            return (True,[word]) # word is okay

    # check if words are transliterated
    if any(filter(lambda x: x in string.ascii_letters,tamil.utf8.get_letters(word))):
        # letter-sequence only
        en_word = Speller.scrub_ws(word)
        EN_Lexicon = Speller.get_english_dictionary()
        if EN_Lexicon.isWord(en_word):
            # English word - nosub- yet until we have parallel dictionaries
            # or translation. TBD.
            return (False,[''])
        # is english letter
        ta = algorithm.Iterative.transliterate(jaffna.Transliteration.table,en_word)
        # TBD: potential for having ANN to tell if english text is pure
        # English word or a romanized Tamil word. Output of classifier can
        # be useful here.
        return (False,[ta])

    # check if it matches Tamil numeral and has close match.
    # propose suggestions from that list.
    # TBD

    # hyphens are not okay
    if word.find(u"-") >= 0:
        return (False,[word.replace(u"-",u" ")])

    orig_word = u"%s"%word

    # remove digits
    word = re.sub(r'\d+',u'',word)
    # NOTE: `letters` is taken before the case filter below and is reused
    # afterwards alongside the case-filtered word.
    letters = tamil.utf8.get_letters(word)

    TVU_dict = self.get_lang_dictionary()
    self.add_numeral_words(TVU_dict)

    # Check if this 'word' is any common kind of error
    if Typographical.checkFormErrors(word,errmsg):
        if errmsg:
            errmsg.append("TypographicalError")

    if not self.checklang(word):
        print("Word is not in desired language!")
        return (False,[u""])

    if len(word) < 1:
        print("Word is too small")
        return (False,[u''])

    # plain old dictionary + user dictionary check
    if self.isWord(word):
        return (True,word)

    # Remove case and redo the dictionary + user check
    word_nocase = self.case_filter.apply( word )
    if self.isWord( word_nocase ):
        return (True,word_nocase)
    word = word_nocase

    # Consider splitting the word and see if it has 2 sub-words
    # e.g. செயல்பட => செயல் + பட
    alt = tamil.wordutils.greedy_split(word,TVU_dict)
    greedy_results = list()
    if len(alt) >= 1:
        greedy_results = [u" ".join(alt),u"-".join(alt)]
        greedy_results.extend(alt)

    # if there are no other suggestions than deletion filter, we return
    # in presence of other suggestions we can just return suggestions
    suggs = DeletionFilter.get_suggestions(letters,TVU_dict)
    if len(suggs) > 0:
        if len(greedy_results) == 0:
            return (False,suggs)
        else:
            greedy_results.extend(suggs)

    # ottru splitting for Tamil language mode
    ottru_options = []
    if self.in_tamil_mode():
        # discover words like யாரிகழ்ந்து are accepted.
        ottru = OttruSplit(word,letters)
        ottru.run(TVU_dict)
        if len(ottru.results) > 0:
            return (True,word)
        ottru_options = ottru.results

    # TODO: Noun Declension - ticket-

    # suggestions at edit distance 1
    norvig_suggests = filter( TVU_dict.isWord, norvig_suggestor( word, self.alphabets, 2,limit=25))
    combinagram_suggests = list(tamil.wordutils.combinagrams(word,TVU_dict,limit=25))
    pfx_options = TVU_dict.getWordsStartingWith( u"".join( letters[:-1] ) )

    # FIXME: score the options
    options = greedy_results
    options.extend( ottru_options )
    options.extend( list(norvig_suggests) )
    options.extend( combinagram_suggests )
    options.extend( pfx_options )

    # filter the options against a dictionary!
    options = filter(TVU_dict.isWord,options )
    if PYTHON3:
        options = list(options)

    if self.in_tamil_mode():
        options.extend( self.mayangoli_suggestions(orig_word,letters) )

    # sort the options (the de-duplication below relies on equal entries
    # being adjacent)
    if not self.in_tamil_mode():
        options.sort()
    else:
        if PYTHON3:
            options = sorted( options, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic) )
        else:
            options = sorted( options, cmp=tamil.utf8.compare_words_lexicographic )

    # remove replacements with single-letter words
    WL = len(tamil.utf8.get_letters(word))
    if WL > 3:
        options = [ x for x in options if len(tamil.utf8.get_letters(x)) > 2 ]

    # remove dupes in list
    options2 = []
    prev = None
    for val in options:
        stripped = val.strip()
        if stripped != prev:
            options2.append(stripped)
            prev = stripped

    if _DEBUG:
        print("@deduplication")
        pprint.pprint(options2)

    # score by Dice or Edit-Distance coefficients
    # (dice coeff is weighted down)
    options_score = [
        (len(word)-edit_distance(word,sugg_word))/(1.0*len(orig_word))*Dice_coeff( word, sugg_word )/3.0
        for sugg_word in options2
    ]
    options = zip( options2, options_score)

    # limit options by score
    options = sorted(options,key=operator.itemgetter(1),reverse=True)
    options = [word_pair[0] for word_pair in options]
    # L = 40
    # limit to first top -L=20 only which is good enough
    # options = options[0:min(len(options),L)]
    if _DEBUG:
        pprint.pprint("@after scoring/sorting")
        pprint.pprint(options)

    # eliminate single letter options; build a real list so that Python 3
    # callers do not receive a lazy, one-shot filter object
    options = [ x for x in options if x not in tamil.utf8.tamil_letters ]

    # Due to suggestion policy we may have words which are found in error
    # but we dont have replacements for them!
    # TBD: options should not have the 'word'!
    return (False, options )