Esempio n. 1
0
 def suggestion_policy(self, word, suggs):
     """Filter and rank candidate suggestions for ``word``.

     Candidates are kept when their length, counted in Tamil letters via
     ``tamil.utf8.get_letters``, lies between ``max(n - 2, 1)`` and
     ``n + 1`` where ``n`` is the letter count of ``word``.  Survivors are
     de-duplicated and ordered by increasing edit distance to ``word``.

     If no candidate survives the length filter, the method falls back to
     the first (at most) ten entries of ``suggs`` in ``tamil_sorted`` order.

     Args:
         word: the word being corrected.
         suggs: iterable of candidate replacement words.

     Returns:
         A list of suggestions.
     """

     def tamil_length(w):
         return len(tamil.utf8.get_letters(w))

     # pick suggestions whose letter length is close to the reference word
     # NOTE(review): the original comment said "+/- 2" but the upper bound
     # has always been +1; the bound itself is left unchanged here.
     ref_wl = tamil_length(word)
     min_len, max_len = max(ref_wl - 2, 1), ref_wl + 1
     # Both bounds are measured in Tamil letters.  The upper bound used to
     # be compared against len(w) (code points), which wrongly rejected
     # words containing multi-code-point letters.
     filter_suggs = {
         w for w in suggs if min_len <= tamil_length(w) <= max_len
     }
     if not filter_suggs:
         # guess!  Keep at most ten entries; the previous slice also threw
         # away the last entry of short lists (and the only entry of a
         # single-element list).
         return list(tamil.utf8.tamil_sorted(suggs))[:10]

     # Compute each distance once instead of on every comparison.
     distance = {w: edit_distance(w, word) for w in filter_suggs}

     def _compare_fn(wA, wB):
         # Proper three-way comparison: negative, zero or positive.  The
         # previous version returned a bool, which cmp_to_key never sees
         # as "less than", so the sort left the order untouched.
         dA, dB = distance[wA], distance[wB]
         return (dA > dB) - (dA < dB)

     return list(
         tamil.utf8.tamil_sorted(filter_suggs,
                                 key=functools.cmp_to_key(_compare_fn)))
def get_min_distance_alternate( pizhai ):
    """Return the dictionary words closest to ``pizhai`` by edit distance.

    The candidate words are a fixed, built-in list.  The edit distance from
    ``pizhai`` to every candidate is computed (and printed), and all
    candidates sharing the minimum distance are returned in list order.

    Args:
        pizhai: the (possibly misspelled) input word.

    Returns:
        list of the candidate words at minimum edit distance.
    """
    agarathi_sorkal = [u'அவிழ்',u'அவல்',u'அவள்',u'தவில்',u'தவள்']
    # Build a real list: on Python 3 map() yields a one-shot iterator, so
    # min() would exhaust it and the later lookups would fail.
    distances = [edit_distance(pizhai, w) for w in agarathi_sorkal]
    print(distances)
    m = min(distances)
    # Collect every word tied for the minimum, preserving list order.
    return [w for w, d in zip(agarathi_sorkal, distances) if d == m]
Esempio n. 3
0
def get_min_distance_alternate(pizhai):
    """Find the built-in candidate words nearest to ``pizhai``.

    Computes the edit distance from ``pizhai`` to each word of a fixed
    candidate list, prints the distances, and returns every candidate whose
    distance equals the minimum, in candidate-list order.
    """
    candidates = ["அவிழ்", "அவல்", "அவள்", "தவில்", "தவள்"]
    distances = []
    for candidate in candidates:
        distances.append(edit_distance(pizhai, candidate))
    print(distances)
    smallest = min(distances)
    # Keep each candidate whose distance ties with the smallest one.
    return [
        candidates[position]
        for position, dist in enumerate(distances)
        if dist == smallest
    ]
Esempio n. 4
0
    def get_min_distance_alternate(self, pizhai):
        """Find the built-in candidate words nearest to ``pizhai``.

        Computes the edit distance from ``pizhai`` to each word of a fixed
        candidate list, prints the distances, and returns every candidate
        whose distance equals the minimum, in candidate-list order.
        """
        from ngram.Distance import edit_distance

        candidates = [u"அவிழ்", u"அவல்", u"அவள்", u"தவில்", u"தவள்"]
        distances = [edit_distance(pizhai, candidate)
                     for candidate in candidates]
        print(distances)
        smallest = min(distances)
        # Keep each candidate whose distance ties with the smallest one.
        nearest = []
        for position, dist in enumerate(distances):
            if dist == smallest:
                nearest.append(candidates[position])
        return nearest
Esempio n. 5
0
    def check_word_and_suggest(self, word, errmsg=None):
        """Spell-check one word and propose replacements.

        The word is stripped of surrounding whitespace and trailing
        punctuation, then run through a sequence of checks (numbers, dates
        and transliteration in Tamil mode; hyphens; dictionary lookup;
        case-folding; ottru splitting).  If none accepts it, suggestions
        are gathered from several generators, filtered against the
        dictionary, de-duplicated, scored and returned best-first.

        Args:
            word: the token to check.
            errmsg: optional list; "TypographicalError" is appended when
                ``Typographical.checkFormErrors`` reports a problem and the
                list is non-empty.

        Returns:
            A tuple ``(ok, suggestions)``.  ``ok`` is True when the word is
            accepted.  ``suggestions`` is a list on every path except the
            dictionary, case-folded and ottru acceptance paths, which
            return the bare word string.
            NOTE(review): that inconsistency is kept for backward
            compatibility with existing callers.
        """
        word = word.strip()
        # skip known punctuation at end of line
        while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
            word = word[:-1]
        while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
            word = word[1:]

        # is number then we propose a numeral
        if self.in_tamil_mode():
            numword = word.replace(u',', u'')
            if re.match(r'[+|-]*[\d]+', numword):
                try:
                    num = float(numword)
                    posnum = num
                    if num < 0:
                        posnum = -1 * num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் " + numeral_form
                    return (False, [numeral_form])
                except Exception:
                    # Best effort: re.match only anchors the start, so
                    # float() can still fail (e.g. digits followed by
                    # letters).  Fall through to the remaining checks.
                    pass

            # dates are okay
            if any(
                    map(word.endswith,
                        [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"])):
                if re.search(r'^\d+', word):
                    return (True, [word])  #word is okay

            # check if words are transliterated
            if any(
                    filter(lambda x: x in string.ascii_letters,
                           tamil.utf8.get_letters(word))):
                # letter-sequence only
                en_word = Speller.scrub_ws(word)
                EN_Lexicon = Speller.get_english_dictionary()
                if EN_Lexicon.isWord(en_word):
                    return (
                        False, ['']
                    )  #English word - nosub- yet until we have parallel dictionaries or translation. TBD.

                #is english letter
                ta = algorithm.Iterative.transliterate(
                    jaffna.Transliteration.table, en_word)
                # TBD: potential for having ANN to tell if english text is pure English word
                # or a romanized Tamil word. Output of classifier can be useful here.
                return (False, [ta])

            # check if it matches Tamil numeral and has close match.
            # propose suggestions from that list.
            # TBD

        # hyphens are not okay
        if word.find(u"-") >= 0:
            return (False, [word.replace(u"-", u" ")])
        # keep the pre-digit-removal form; it is used for scoring and
        # for the mayangoli suggestions below
        orig_word = u"%s" % word

        # remove digits
        word = re.sub(r'\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)

        # Check if this 'word' is any common kind of error
        if Typographical.checkFormErrors(word, errmsg):
            if errmsg: errmsg.append("TypographicalError")

        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False, [u""])

        if len(word) < 1:
            print("Word is too small")
            return (False, [u''])

        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if self.isWord(word_nocase):
            return (True, word_nocase)
        word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)

        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False, suggs)
            else:
                greedy_results.extend(suggs)

        # ottru splitting for Tamil language mode
        ottru_options = []
        if self.in_tamil_mode():
            # discover words like யாரிகழ்ந்து are accepted.
            ottru = OttruSplit(word, letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True, word)
            ottru_options = ottru.results

        # TODO: Noun Declension - ticket-

        # dictionary-checked candidates from norvig_suggestor
        norvig_suggests = filter(
            TVU_dict.isWord, norvig_suggestor(word,
                                              self.alphabets,
                                              2,
                                              limit=25))
        combinagram_suggests = list(
            tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

        # pool every candidate source into one list
        options = greedy_results
        options.extend(ottru_options)
        options.extend(list(norvig_suggests))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)

        # filter the options against a dictionary!  Built as a real list on
        # every Python version because it is extended and sorted below.
        options = [opt for opt in options if TVU_dict.isWord(opt)]

        if self.in_tamil_mode():
            options.extend(self.mayangoli_suggestions(orig_word, letters))

        # sort the options
        if not self.in_tamil_mode():
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options,
                                 key=functools.cmp_to_key(
                                     tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options,
                                 cmp=tamil.utf8.compare_words_lexicographic)

        # for words longer than 3 letters, drop replacements of 2 letters
        # or fewer
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = [
                x for x in options if len(tamil.utf8.get_letters(x)) > 2
            ]

        # remove dupes in list (adjacent duplicates only; the list is sorted)
        options2 = []
        prev = None
        for val in options:
            if val.strip() != prev:
                options2.append(val.strip())
            prev = val.strip()
        del options
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)

        # score by Dice or Edit-Distance coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            options_score[itr] = (len(word) - edit_distance(
                word, sugg_word)) / (1.0 * len(orig_word)) * Dice_coeff(
                    word, sugg_word) / 3.0  #dice coeff is weighted down
        options = zip(options2, options_score)

        # order options by score, best first
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)

        # eliminate single letter options.  Returned as a list: on Python 3
        # a bare filter() would hand callers a one-shot iterator, unlike
        # every other return path of this method.
        options = [x for x in options if x not in tamil.utf8.tamil_letters]

        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!

        # TBD: options should not have the 'word'!
        return (False, options)
Esempio n. 6
0
    def check_word_and_suggest(self, word, errmsg=None):
        """Spell-check one word and propose replacements.

        The word is stripped of surrounding whitespace and trailing
        punctuation, then run through a sequence of checks (numbers, dates
        and transliteration in Tamil mode; hyphens; dictionary lookup;
        case-folding; ottru splitting).  If none accepts it, suggestions
        are gathered from several generators, filtered against the
        dictionary, de-duplicated, scored and returned best-first.

        Args:
            word: the token to check.
            errmsg: optional list; "TypographicalError" is appended when
                ``Typographical.checkFormErrors`` reports a problem and the
                list is non-empty.

        Returns:
            A tuple ``(ok, suggestions)``.  ``ok`` is True when the word is
            accepted.  ``suggestions`` is a list on every path except the
            dictionary, case-folded and ottru acceptance paths, which
            return the bare word string.
            NOTE(review): that inconsistency is kept for backward
            compatibility with existing callers.
        """
        word = word.strip()
        # skip known punctuation at end of line
        while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
            word = word[:-1]
        while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
            word = word[1:]

        # is number then we propose a numeral
        if self.in_tamil_mode():
            numword = word.replace(u',', u'')
            if re.match(r'[+|-]*[\d]+', numword):
                try:
                    num = float(numword)
                    posnum = num
                    if num < 0:
                        posnum = -1 * num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் " + numeral_form
                    return (False, [numeral_form])
                except Exception:
                    # Best effort: re.match only anchors the start, so
                    # float() can still fail (e.g. digits followed by
                    # letters).  Fall through to the remaining checks.
                    pass

            # dates are okay
            date_suffixes = [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"]
            if any(map(word.endswith, date_suffixes)):
                if re.search(r'^\d+', word):
                    return (True, [word])  # word is okay

            # check if words are transliterated
            if any(filter(lambda x: x in string.ascii_letters,
                          tamil.utf8.get_letters(word))):
                # letter-sequence only
                en_word = Speller.scrub_ws(word)
                EN_Lexicon = Speller.get_english_dictionary()
                if EN_Lexicon.isWord(en_word):
                    # English word - nosub- yet until we have parallel
                    # dictionaries or translation. TBD.
                    return (False, [''])

                # is english letter
                ta = algorithm.Iterative.transliterate(
                    jaffna.Transliteration.table, en_word)
                # TBD: potential for having ANN to tell if english text is pure English word
                # or a romanized Tamil word. Output of classifier can be useful here.
                return (False, [ta])

            # check if it matches Tamil numeral and has close match.
            # propose suggestions from that list.
            # TBD

        # hyphens are not okay
        if word.find(u"-") >= 0:
            return (False, [word.replace(u"-", u" ")])
        # keep the pre-digit-removal form; it is used for scoring and
        # for the mayangoli suggestions below
        orig_word = u"%s" % word

        # remove digits
        word = re.sub(r'\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)

        # Check if this 'word' is any common kind of error
        if Typographical.checkFormErrors(word, errmsg):
            if errmsg: errmsg.append("TypographicalError")

        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False, [u""])

        if len(word) < 1:
            print("Word is too small")
            return (False, [u''])

        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if self.isWord(word_nocase):
            return (True, word_nocase)
        word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)

        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False, suggs)
            else:
                greedy_results.extend(suggs)

        # ottru splitting for Tamil language mode
        ottru_options = []
        if self.in_tamil_mode():
            # discover words like யாரிகழ்ந்து are accepted.
            ottru = OttruSplit(word, letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True, word)
            ottru_options = ottru.results

        # TODO: Noun Declension - ticket-

        # dictionary-checked candidates from norvig_suggestor
        norvig_suggests = filter(
            TVU_dict.isWord,
            norvig_suggestor(word, self.alphabets, 2, limit=25))
        combinagram_suggests = list(
            tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

        # pool every candidate source into one list
        options = greedy_results
        options.extend(ottru_options)
        options.extend(list(norvig_suggests))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)

        # filter the options against a dictionary!  Built as a real list on
        # every Python version because it is extended and sorted below.
        options = [opt for opt in options if TVU_dict.isWord(opt)]

        if self.in_tamil_mode():
            options.extend(self.mayangoli_suggestions(orig_word, letters))

        # sort the options
        if not self.in_tamil_mode():
            options.sort()
        else:
            if PYTHON3:
                options = sorted(
                    options,
                    key=functools.cmp_to_key(
                        tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(
                    options, cmp=tamil.utf8.compare_words_lexicographic)

        # for words longer than 3 letters, drop replacements of 2 letters
        # or fewer
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = [
                x for x in options if len(tamil.utf8.get_letters(x)) > 2
            ]

        # remove dupes in list (adjacent duplicates only; the list is sorted)
        options2 = []
        prev = None
        for val in options:
            if val.strip() != prev:
                options2.append(val.strip())
            prev = val.strip()
        del options
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)

        # score by Dice or Edit-Distance coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            options_score[itr] = (len(word) - edit_distance(
                word, sugg_word)) / (1.0 * len(orig_word)) * Dice_coeff(
                    word, sugg_word) / 3.0  # dice coeff is weighted down
        options = zip(options2, options_score)

        # order options by score, best first
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)

        # eliminate single letter options.  Returned as a list: on Python 3
        # a bare filter() would hand callers a one-shot iterator, unlike
        # every other return path of this method.
        options = [x for x in options if x not in tamil.utf8.tamil_letters]

        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!

        # TBD: options should not have the 'word'!
        return (False, options)