def test_Norvig_suggestor(self):
    """Check how many candidates norvig_suggestor yields at edit distance 1.

    The distance-2 expansion is deliberately not executed (it was disabled
    for using too much memory), so an empty list stands in for it.
    """
    word = u"ஆங்கிலம்"
    single_edit = norvig_suggestor(word, None, 1)
    # distance 2 is skipped on purpose: too much memory
    double_edit = []
    observed_sizes = [len(single_edit), len(double_edit)]
    self.assertEqual(observed_sizes, [5150, 0])
Exemple #2
0
 def tamil_Norvig_correct_spelling(self, word, limits=None):
     """Collect dictionary words reachable from ``word`` by Norvig-style edits.

     Nothing is suggested when ``self.tamilwordchecker`` already knows the
     word.  Otherwise candidates are generated for every edit distance from
     1 up to ``self.edit_distance`` and kept only if the word checker
     accepts them.

     Args:
         word: the word to correct.
         limits: optional two-item sequence ``[min_letters, max_letters]``;
             when exactly two items are given, suggestions whose Tamil
             letter count falls outside that inclusive range are dropped.
             Any other length (including the default) disables the filter.

     Returns:
         A list of suggested words (possibly empty, possibly containing
         repeats across edit distances).  Any exception raised while
         generating suggestions is printed as a traceback and whatever was
         collected so far is returned.
     """
     # ``None`` replaces the former mutable ``[]`` default, which would be
     # shared between calls.
     if limits is None:
         limits = []
     suggested_words = []
     try:
         word_exists = self.tamilwordchecker.tamil_word_exists
         if not word_exists(word):
             for distance in range(1, self.edit_distance + 1):
                 candidates = norvig_suggestor(
                     word, tamil.utf8.tamil247, nedits=distance, limit=1000)
                 suggested_words.extend(filter(word_exists, candidates))
             if len(limits) == 2:
                 # single pass: keep words within [limits[0], limits[1]] letters
                 suggested_words = [
                     candidate for candidate in suggested_words
                     if limits[0] <= len(tamil.utf8.get_letters(candidate)) <= limits[1]
                 ]
     except Exception:
         # deliberate best effort: report the failure, return partial results
         print(traceback.format_exc())
     return suggested_words
Exemple #3
0
    def check_word_and_suggest(self, word, errmsg=None):
        """Spell-check a single token and propose replacements.

        Args:
            word: the token to check.  Surrounding whitespace and trailing
                characters found in ``Speller.punctuation`` are removed
                before checking.
            errmsg: optional list handed to
                ``Typographical.checkFormErrors``; when it is non-empty and
                a form error is reported, ``"TypographicalError"`` is
                appended to it.

        Returns:
            A ``(ok, suggestions)`` tuple.  ``ok`` is True when the token is
            accepted.  On the dictionary, case-filtered and ottru-split
            acceptance paths the second item is the accepted word itself
            (a string); on every other path it is a list of strings.
        """
        word = word.strip()
        # skip known punctuation at end of line
        while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
            word = word[:-1]
        while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
            word = word[1:]

        # is number then we propose a numeral
        if self.in_tamil_mode():
            numword = word.replace(u',', u'')
            # '\\d' is the same pattern as before, written without the
            # invalid string escape that Python 3 warns about.
            if re.match(u'[+|-]*[\\d]+', numword):
                try:
                    num = float(numword)
                    posnum = num
                    if num < 0:
                        posnum = -1 * num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் " + numeral_form
                    return (False, [numeral_form])
                except Exception:
                    # Best effort: re.match only anchors the prefix, so a
                    # token like digits followed by letters fails float()
                    # and falls through to the checks below.
                    pass

            # dates are okay
            if any(
                    map(word.endswith,
                        [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"])):
                if re.search(u'^\\d+', word):
                    return (True, [word])  # word is okay

            # check if words are transliterated
            if any(
                    filter(lambda x: x in string.ascii_letters,
                           tamil.utf8.get_letters(word))):
                # letter-sequence only
                en_word = Speller.scrub_ws(word)
                EN_Lexicon = Speller.get_english_dictionary()
                if EN_Lexicon.isWord(en_word):
                    # English word - no substitution yet until we have
                    # parallel dictionaries or translation. TBD.
                    return (False, [''])

                # is english letter
                ta = algorithm.Iterative.transliterate(
                    jaffna.Transliteration.table, en_word)
                # TBD: potential for having ANN to tell if english text is pure English word
                # or a romanized Tamil word. Output of classifier can be useful here.
                return (False, [ta])

            # check if it matches Tamil numeral and has close match.
            # propose suggestions from that list.
            # TBD

        # hyphens are not okay
        if word.find(u"-") >= 0:
            return (False, [word.replace(u"-", u" ")])
        # keep the pre-digit-removal form; it is used for scoring and for
        # the mayangoli suggestions below
        orig_word = u"%s" % word

        # remove digits
        word = re.sub(u'\\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)

        # Check if this 'word' is any common kind of error
        if Typographical.checkFormErrors(word, errmsg):
            # NOTE(review): an empty errmsg list is falsy and therefore not
            # extended here - confirm that is intended.
            if errmsg:
                errmsg.append("TypographicalError")

        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False, [u""])

        if len(word) < 1:
            print("Word is too small")
            return (False, [u''])

        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if self.isWord(word_nocase):
            return (True, word_nocase)
        word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)

        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False, suggs)
            greedy_results.extend(suggs)

        # ottru splitting for Tamil language mode
        if self.in_tamil_mode():
            # discover words like யாரிகழ்ந்து are accepted.
            ottru = OttruSplit(word, letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True, word)
            # Past this point ottru.results is empty, so it contributes
            # nothing to the suggestion pool.

        # TODO: Noun Declension - ticket-

        # suggestions up to edit distance 2
        norvig_candidates = norvig_suggestor(word,
                                             self.alphabets,
                                             2,
                                             limit=25)
        combinagram_suggests = list(
            tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

        # FIXME: score  the options
        options = greedy_results
        options.extend(filter(TVU_dict.isWord, norvig_candidates))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)

        # filter the options against a dictionary!
        options = [opt for opt in options if TVU_dict.isWord(opt)]

        if self.in_tamil_mode():
            options.extend(self.mayangoli_suggestions(orig_word, letters))

        # sort the options so that duplicates become adjacent
        if not self.in_tamil_mode():
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options,
                                 key=functools.cmp_to_key(
                                     tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options,
                                 cmp=tamil.utf8.compare_words_lexicographic)

        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = [
                opt for opt in options
                if len(tamil.utf8.get_letters(opt)) > 2
            ]

        # remove dupes in list (adjacent ones, after stripping)
        options2 = []
        prev = None
        for val in options:
            stripped = val.strip()
            if stripped != prev:
                options2.append(stripped)
            prev = stripped
        del options
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)

        # score by Dice or Edit-Distance coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            options_score[itr] = (len(word) - edit_distance(
                word, sugg_word)) / (1.0 * len(orig_word)) * Dice_coeff(
                    word, sugg_word) / 3.0  # dice coeff is weighted down
        options = zip(options2, options_score)

        # order options by score, best first
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)

        # eliminate single letter options; built as a real list so that the
        # result is not a one-shot lazy filter object on Python 3
        options = [
            opt for opt in options if opt not in tamil.utf8.tamil_letters
        ]

        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!

        # TBD: options should not have the 'word'!
        return (False, options)
Exemple #4
0
 def check_word_and_suggest( self,word ):
     """Spell-check ``word`` and return ``(ok, suggestions)``.

     ASCII punctuation and digits are removed first.  When the cleaned word
     (or its case-filtered form) is accepted by ``self.isWord`` the result
     is ``(True, accepted_word)``.  Otherwise the result is ``(False, list)``
     holding at most 20 candidates ordered by descending Dice coefficient;
     the list is ``[u'']`` when the word fails ``self.checklang`` or is
     empty after cleaning.
     """
     word = word.strip()
     # drop every ASCII punctuation character
     for mark in string.punctuation:
         word = word.replace(mark,u"")
     # drop digit runs
     word = re.sub(u'\d+',u'',word)
     letters = tamil.utf8.get_letters(word)
     TVU_dict = self.get_lang_dictionary()

     # wrong language or nothing left to check
     if not self.checklang(word) or len(word) < 1:
         return (False,[u''])

     # plain dictionary + user dictionary lookup
     if self.isWord(word):
         return (True,word)

     # retry after the case filter; continue with that form on failure
     word_nocase = self.case_filter.apply( word )
     if self.isWord( word_nocase ):
         return (True,word_nocase)
     word = word_nocase

     # Try splitting into sub-words, e.g. செயல்பட => செயல் + பட
     pieces = tamil.wordutils.greedy_split(word,TVU_dict)
     candidates = []
     if len(pieces) >= 1:
         candidates = [u" ".join(pieces),u"-".join(pieces)]
         candidates.extend(pieces)

     # TODO: Noun Declension - ticket-

     # other candidate sources: edits, combinagrams, shared prefix
     norvig_suggests = filter( TVU_dict.isWord, norvig_suggestor( word, self.alphabets, 2,limit=50))
     combinagram_suggests = list(tamil.wordutils.combinagrams(word,TVU_dict,limit=50))
     pfx_options = TVU_dict.getWordsStartingWith( u"".join( letters[:-1] ) )

     candidates.extend( norvig_suggests )
     candidates.extend( combinagram_suggests )
     candidates.extend( pfx_options )

     # sort so that equal candidates end up next to each other
     if self.lang == u"en":
         candidates.sort()
     elif PYTHON3:
         candidates = sorted( candidates, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic) )
     else:
         candidates = sorted( candidates, cmp=tamil.utf8.compare_words_lexicographic )

     # for words longer than 3 letters, discard candidates of 1-2 letters
     if len(tamil.utf8.get_letters(word)) > 3:
         candidates = [c for c in candidates if len(tamil.utf8.get_letters(c)) > 2]

     # collapse adjacent duplicates (compared after stripping)
     unique = []
     last_seen = None
     for cand in candidates:
         cand = cand.strip()
         if cand != last_seen:
             unique.append(cand)
         last_seen = cand

     # rank by Dice coefficient, best first
     scored = [(cand, Dice_coeff( word, cand )) for cand in unique]
     scored.sort(key=operator.itemgetter(1),reverse=True)
     ranked = [pair[0] for pair in scored]

     # the top 20 are good enough
     return (False, ranked[:20] )
    def check_word_and_suggest(self, word):
        """Check one word and, when it is rejected, rank replacement candidates.

        Returns ``(True, word)`` when the punctuation/digit-free word, or
        its case-filtered form, passes ``self.isWord``.  Returns
        ``(False, [u""])`` when the word fails ``self.checklang`` or is
        empty after cleaning.  Otherwise returns ``(False, suggestions)``
        with up to 20 suggestions sorted by descending Dice coefficient.
        """
        cleaned = word.strip()
        # strip ASCII punctuation, then digit runs
        for symbol in string.punctuation:
            cleaned = cleaned.replace(symbol, u"")
        word = re.sub(u"\d+", u"", cleaned)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()

        if not self.checklang(word):
            return (False, [u""])
        if len(word) < 1:
            return (False, [u""])

        # dictionary + user dictionary, first as-is and then case-filtered
        if self.isWord(word):
            return (True, word)
        word = self.case_filter.apply(word)
        if self.isWord(word):
            return (True, word)

        # Sub-word split, e.g. செயல்பட => செயல் + பட
        parts = tamil.wordutils.greedy_split(word, TVU_dict)
        if len(parts) >= 1:
            pool = [u" ".join(parts), u"-".join(parts)] + list(parts)
        else:
            pool = []

        # TODO: Noun Declension - ticket-

        # gather the remaining candidate sources in the original order
        norvig_suggests = filter(TVU_dict.isWord, norvig_suggestor(word, self.alphabets, 2, limit=50))
        combinagram_suggests = list(tamil.wordutils.combinagrams(word, TVU_dict, limit=50))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))
        for source in (norvig_suggests, combinagram_suggests, pfx_options):
            pool.extend(source)

        # order the pool so duplicates are adjacent
        if self.lang == u"en":
            pool.sort()
        elif PYTHON3:
            pool = sorted(pool, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
        else:
            pool = sorted(pool, cmp=tamil.utf8.compare_words_lexicographic)

        # long words should not be replaced by 1-2 letter candidates
        if len(tamil.utf8.get_letters(word)) > 3:
            pool = [item for item in pool if len(tamil.utf8.get_letters(item)) > 2]

        # drop adjacent repeats, comparing stripped forms
        distinct = []
        previous = None
        for item in pool:
            item = item.strip()
            if item != previous:
                distinct.append(item)
            previous = item

        # score with the Dice coefficient and keep the best 20
        scores = [Dice_coeff(word, item) for item in distinct]
        ranked = sorted(zip(distinct, scores), key=operator.itemgetter(1), reverse=True)
        best = [item for item, _score in ranked]
        return (False, best[:20])
Exemple #6
0
    def check_word_and_suggest(self, word, errmsg=None):
        """Spell-check a single token and propose replacements.

        Args:
            word: the token to check.  Surrounding whitespace and trailing
                characters found in ``Speller.punctuation`` are removed
                before checking.
            errmsg: optional list handed to
                ``Typographical.checkFormErrors``; when it is non-empty and
                a form error is reported, ``"TypographicalError"`` is
                appended to it.

        Returns:
            A ``(ok, suggestions)`` tuple.  ``ok`` is True when the token is
            accepted.  On the dictionary, case-filtered and ottru-split
            acceptance paths the second item is the accepted word itself
            (a string); on every other path it is a list of strings.
        """
        word = word.strip()
        # skip known punctuation at end of line
        while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
            word = word[:-1]
        while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
            word = word[1:]

        # is number then we propose a numeral
        if self.in_tamil_mode():
            numword = word.replace(u',', u'')
            # '\\d' is the same pattern as before, written without the
            # invalid string escape that Python 3 warns about.
            if re.match(u'[+|-]*[\\d]+', numword):
                try:
                    num = float(numword)
                    posnum = num
                    if num < 0:
                        posnum = -1 * num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் " + numeral_form
                    return (False, [numeral_form])
                except Exception:
                    # Best effort: re.match only anchors the prefix, so a
                    # token like digits followed by letters fails float()
                    # and falls through to the checks below.
                    pass

            # dates are okay
            if any(map(word.endswith, [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"])):
                if re.search(u'^\\d+', word):
                    return (True, [word])  # word is okay

            # check if words are transliterated
            if any(filter(lambda x: x in string.ascii_letters, tamil.utf8.get_letters(word))):
                # letter-sequence only
                en_word = Speller.scrub_ws(word)
                EN_Lexicon = Speller.get_english_dictionary()
                if EN_Lexicon.isWord(en_word):
                    # English word - no substitution yet until we have
                    # parallel dictionaries or translation. TBD.
                    return (False, [''])

                # is english letter
                ta = algorithm.Iterative.transliterate(jaffna.Transliteration.table, en_word)
                # TBD: potential for having ANN to tell if english text is pure English word
                # or a romanized Tamil word. Output of classifier can be useful here.
                return (False, [ta])

            # check if it matches Tamil numeral and has close match.
            # propose suggestions from that list.
            # TBD

        # hyphens are not okay
        if word.find(u"-") >= 0:
            return (False, [word.replace(u"-", u" ")])
        # keep the pre-digit-removal form; it is used for scoring and for
        # the mayangoli suggestions below
        orig_word = u"%s" % word

        # remove digits
        word = re.sub(u'\\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)

        # Check if this 'word' is any common kind of error
        if Typographical.checkFormErrors(word, errmsg):
            # NOTE(review): an empty errmsg list is falsy and therefore not
            # extended here - confirm that is intended.
            if errmsg:
                errmsg.append("TypographicalError")

        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False, [u""])

        if len(word) < 1:
            print("Word is too small")
            return (False, [u''])

        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if self.isWord(word_nocase):
            return (True, word_nocase)
        word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)

        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False, suggs)
            greedy_results.extend(suggs)

        # ottru splitting for Tamil language mode
        if self.in_tamil_mode():
            # discover words like யாரிகழ்ந்து are accepted.
            ottru = OttruSplit(word, letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True, word)
            # Past this point ottru.results is empty, so it contributes
            # nothing to the suggestion pool.

        # TODO: Noun Declension - ticket-

        # suggestions up to edit distance 2
        norvig_candidates = norvig_suggestor(word, self.alphabets, 2, limit=25)
        combinagram_suggests = list(tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

        # FIXME: score  the options
        options = greedy_results
        options.extend(filter(TVU_dict.isWord, norvig_candidates))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)

        # filter the options against a dictionary!
        options = [opt for opt in options if TVU_dict.isWord(opt)]

        if self.in_tamil_mode():
            options.extend(self.mayangoli_suggestions(orig_word, letters))

        # sort the options so that duplicates become adjacent
        if not self.in_tamil_mode():
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options, cmp=tamil.utf8.compare_words_lexicographic)

        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = [opt for opt in options if len(tamil.utf8.get_letters(opt)) > 2]

        # remove dupes in list (adjacent ones, after stripping)
        options2 = []
        prev = None
        for val in options:
            stripped = val.strip()
            if stripped != prev:
                options2.append(stripped)
            prev = stripped
        del options
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)

        # score by Dice or Edit-Distance coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            # dice coeff is weighted down
            options_score[itr] = (len(word) - edit_distance(word, sugg_word)) / (1.0 * len(orig_word)) * Dice_coeff(word, sugg_word) / 3.0
        options = zip(options2, options_score)

        # order options by score, best first
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)

        # eliminate single letter options; built as a real list so that the
        # result is not a one-shot lazy filter object on Python 3
        options = [opt for opt in options if opt not in tamil.utf8.tamil_letters]

        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!

        # TBD: options should not have the 'word'!
        return (False, options)
Exemple #7
0
 def test_Norvig_suggestor(self):
     """Pin the number of candidates produced at edit distances 1 and 2."""
     word = u"ஆங்கிலம்"
     distance_one = norvig_suggestor(word, None, 1)
     distance_two = norvig_suggestor(word, None, 2)
     candidate_counts = [len(distance_one), len(distance_two)]
     self.assertEqual(candidate_counts, [337, 55423])
 def test_Norvig_suggestor( self ):
     """Verify the candidate counts norvig_suggestor gives for 1 and 2 edits."""
     word = u"ஆங்கிலம்"
     # generate both candidate sets first, then compare their sizes
     results = [norvig_suggestor( word, None, nedits ) for nedits in (1, 2)]
     sizes = [len(found) for found in results]
     self.assertEqual( sizes, [337,55423] )