class SoundexTest(unittest.TestCase):
    """Checks the codes produced by ``Soundex.soundex`` and the result
    codes returned by ``Soundex.compare`` for a handful of fixed words."""

    def setUp(self):
        super(SoundexTest, self).setUp()
        # A fresh encoder is created for every test method.
        self.instance = Soundex()

    def test_soundex(self):
        '''TEST: Soundex calculation'''
        # (input word, expected code) pairs, verified in this order.
        expected_codes = (
            ('vasudev', 'v231'),
            ('Rupert', 'R163'),
            (u'ಬೆಂಗಳೂರು', u'ಬDNFQCPC'),
            (u'बॆंगळूरु', u'बDNFQCPC'),
            (u'आम्र् फल्', u'आNPMQ000'),
        )
        for word, code in expected_codes:
            self.assertEqual(self.instance.soundex(word), code)

    def test_compare(self):
        '''TEST: Soundex Comparison'''
        # (first word, second word, expected comparison result) triples.
        expected_results = (
            ('Bangalore', u'ಬೆಂಗಳೂರು', -1),
            (u'ಬೆಂಗಳೂರು', u'बॆंगळूरु', 2),
            (u'बॆंगळूरु', u'बॆंगळूरु', 0),
            (u'बॆंगळूरु', u'आम्र् फल्', -1),
        )
        for first, second, result in expected_results:
            self.assertEqual(self.instance.compare(first, second), result)
def soundex(word1, word2):
    """
    Tell whether two words are equal or sound the same.

    See https://libindic.org/Soundex

    :param word1: first word of the pair
    :param word2: second word of the pair
    :return: True when ``words_equal`` accepts the pair, or when the
        Soundex comparison of the two words yields 1; False otherwise
    """
    if words_equal(word1, word2):
        return True

    # Result of 1 means sounds the same
    sounds_same = Soundex().compare(word1, word2) == 1
    return bool(sounds_same)
class BaseMalayalam:
    """
    Malayalam Spell Checker class.

    Words are stemmed and their stem is looked up in a trie of root words.
    Suggestions are ranked with a weight derived from a Soundex comparison,
    a syllable-level Levenshtein distance and a Jaccard coefficient over
    letter unigrams.
    """

    Suggestion = namedtuple('Suggestion', 'word sound lev jac weight tag_list')

    def __init__(self):
        """
        Initialize necessary resources.

        Loads ``data/ml_rootwords.txt`` (relative to this module) into a
        trie and creates the stemmer, inflector, Soundex, syllabifier and
        n-gram helpers.
        """
        # Local import keeps this block self-contained. io.open decodes the
        # file as UTF-8 on both Python 2 and Python 3 instead of relying on
        # the locale's default encoding.
        import io

        dictionary_path = os.path.join(
            os.path.dirname(__file__), 'data/ml_rootwords.txt')
        with io.open(dictionary_path, encoding='utf-8') as dictionary_file:
            root_words = [line.strip() for line in dictionary_file]
        # Attribute kept for backward compatibility; as before, the handle
        # is already closed once construction finishes.
        self.dictionary_file = dictionary_file
        self.dictionary = marisa_trie.Trie(root_words)
        self.stemmer = Stemmer()
        self.inflector = inflector.Inflector(lang='ml')
        self.soundex = Soundex()
        self.syllabalizer = Syllabifier()
        self.ngrammer = Ngram()

    def check(self, word):
        """
        Returns if a word is spelled correctly or not.

        The word is stemmed first; it counts as correct when its stem is
        present in the root-word dictionary.
        """
        root_word = self.stemmer.stem(word)[word]['stem']
        return root_word in self.dictionary

    def get_best_intermediate(self, word, input_word,
                              intermediate_words, original_tag_list):
        """
        Return the best intermediate form from those generated during
        stemming. Best intermediate term is the one for which maximum
        similarity is found. It is used to handle incorrect words getting
        unnecessarily stemmed as they are not present in dictionary.

        :param word: dictionary word every intermediate form is compared to
        :param input_word: fallback selection when there are no
            intermediate forms
        :param intermediate_words: candidate forms, in the order produced
            by ``suggest``
        :param original_tag_list: inflection tags of the original input
        :returns: ``(word_tags_map, highest_weight, selected_word)`` where
            ``word_tags_map`` maps each intermediate form to the prefix of
            ``original_tag_list`` matching its position
        """
        weights = []
        word_tags_map = {}
        for position, intermediate_word in enumerate(intermediate_words):
            _, _, _, weight = self.compare(intermediate_word, word)
            weights.append(weight)
            word_tags_map[intermediate_word] = original_tag_list[:position]

        if not weights:
            return word_tags_map, 0, input_word

        highest_weight = max(weights)
        # index() picks the first form reaching the highest weight.
        selected_word = intermediate_words[weights.index(highest_weight)]
        return word_tags_map, highest_weight, selected_word

    def get_unique(self, list_of_items):
        """
        Return the items of ``list_of_items`` without duplicates, keeping
        the order of first appearance. Items only need to support ``==``.
        """
        result = []
        for item in list_of_items:
            if item not in result:
                result.append(item)
        return result

    def suggest(self, input_word, n=5):
        """
        Returns n suggestions that is similar to word.

        Candidates are the dictionary words starting with the first
        character of the stem, or with its neighbours in ``_characters``.
        Only candidates whose weight exceeds 50 are kept; the ``n`` best are
        re-inflected with the applicable tags and returned without
        duplicates. An empty stem yields an empty list. A stem whose first
        character is missing from ``_characters`` makes
        ``_characters.index`` raise (``ValueError`` for built-in
        sequences).
        """
        stemmer_result = self.stemmer.stem(input_word)[input_word]
        input_word = stemmer_result['stem']
        tag_list = stemmer_result['inflection']
        if not input_word:
            # Nothing to index into; there is no prefix to search with.
            return []

        first_char = input_word[0]
        if first_char == _characters[0]:
            prev_char = first_char
        else:
            prev_char = _characters[_characters.index(first_char) - 1]
        if first_char == _characters[-1]:
            next_char = first_char
        else:
            next_char = _characters[_characters.index(first_char) + 1]

        # At either end of _characters a neighbour equals first_char; query
        # each prefix once so duplicate candidates cannot crowd the top n.
        possible_words = []
        for prefix in self.get_unique([first_char, next_char, prev_char]):
            possible_words.extend(self.dictionary.keys(prefix))

        original_tag_list = tag_list
        # Progressively re-inflected forms, the most inflected one first and
        # the bare stem last.
        intermediate_words = [input_word]
        for tag_counter in range(len(tag_list)):
            new_word = self.inflector.inflect(
                input_word, tag_list[-tag_counter - 1:])
            intermediate_words.insert(0, new_word)

        final = []
        for word in possible_words:
            lev, sound, jac, weight1 = self.compare(input_word, word)
            word_tags_map, highest_weight, selected_word = \
                self.get_best_intermediate(
                    word, input_word, intermediate_words, original_tag_list)
            tag_list = original_tag_list
            if highest_weight >= weight1 and selected_word != input_word:
                tag_list = word_tags_map[selected_word]
            weight = max(weight1, highest_weight)
            if weight > 50:
                # self.Suggestion resolves to the class attribute and does
                # not depend on a name defined outside this class.
                final.append(self.Suggestion(
                    word, sound, lev, jac, weight, tag_list))

        best = sorted(final, key=attrgetter('weight'), reverse=True)[:n]
        final_list = []
        for item in best:
            try:
                final_list.append(
                    self.inflector.inflect(item.word, item.tag_list))
            except Exception:
                # Best effort: fall back to the uninflected candidate.
                final_list.append(item.word)
        return self.get_unique(final_list)

    def levenshtein_distance(self, tokens1, tokens2):
        """
        Takes two lists containing tokens of one word each and returns the
        levenshtein distance between them.
        """
        if len(tokens1) < len(tokens2):
            return self.levenshtein_distance(tokens2, tokens1)
        if len(tokens2) == 0:
            return len(tokens1)

        previous_row = list(range(len(tokens2) + 1))
        for i, c1 in enumerate(tokens1):
            current_row = [i + 1]
            for j, c2 in enumerate(tokens2):
                # j+1 instead of j since previous_row and current_row are
                # one element longer than tokens2
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(
                    min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    @staticmethod
    def _jaccard(ngrams1, ngrams2):
        """
        Return the Jaccard coefficient of two n-gram sequences: distinct
        shared items divided by distinct items overall. Returns 0.0 when
        both sequences are empty instead of dividing by zero.
        """
        union = []
        for item in list(ngrams1) + list(ngrams2):
            if item not in union:
                union.append(item)
        if not union:
            return 0.0
        shared = [item for item in union
                  if item in ngrams1 and item in ngrams2]
        return float(len(shared)) / float(len(union))

    def compare(self, word1, word2):
        """
        Returns the similarity measure between two words.

        :returns: ``(levenshtein_distance, soundex_comparison, jaccards,
            weight)``. ``weight`` is 100 when the Soundex comparison is 0
            or 1, ``75 + 1.5 * jaccards`` when the distance is at most 2
            and ``jaccards`` exceeds 0.5, ``65 + 3 * jaccards`` when the
            distance is below 5 and ``jaccards`` exceeds 0.5, and 0
            otherwise.
        """
        soundex_comparison = self.soundex.compare(word1, word2)
        tokens1 = self.syllabalizer.syllabify_ml(word1)
        tokens2 = self.syllabalizer.syllabify_ml(word2)
        levenshtein_distance = self.levenshtein_distance(tokens1, tokens2)
        ngram1 = self.ngrammer.letterNgram(word1, 1)
        ngram2 = self.ngrammer.letterNgram(word2, 1)
        jaccards = self._jaccard(ngram1, ngram2)

        if soundex_comparison == 1 or soundex_comparison == 0:
            weight = 100
        elif levenshtein_distance <= 2 and jaccards > 0.5:
            weight = 75 + (1.5 * jaccards)
        elif levenshtein_distance < 5 and jaccards > 0.5:
            weight = 65 + (3 * jaccards)
        else:
            weight = 0
        return levenshtein_distance, soundex_comparison, jaccards, weight

    def check_and_generate(self, word):
        """
        Receives a word as input, checks if it is a valid word and returns
        the suggestions if it is not.
        Returns 0 along with suggestions if an incorrect word.
        Returns 1 along with blank list of suggestions if word in dictionary.
        Returns 2 along with blank list of suggestions if word is unique.
        """
        if self.check(word):
            return {'status': 1, 'suggestions': []}

        suggestions = self.suggest(word)
        if suggestions:
            return {'status': 0, 'suggestions': suggestions}

        # If there were no suggestions, it means the word was not
        # similar to any of the existing root words. So, that was not a
        # mistake, but an intended insertion. Hence, it is deemed as a
        # valid word
        return {'status': 2, 'suggestions': []}
# Script fragment. Indentation was lost in this copy, so the exact nesting of
# the statements below cannot be confirmed from here.
# - The first loop walks zip(X, y, z) and (presumably once per triple) appends
#   instance.compare() of each pair (i, j), (i, k), (j, k) to lis1, lis2, lis3.
# - The '''...''' span is a bare string literal that is not assigned to
#   anything, i.e. disabled code that has no effect at runtime.
# - The last loop steps i through range(0, 101, 10), skips 0, and slices the
#   leading int(len(X) * (i/100)) items of X into newX.
# NOTE(review): the i/100 fraction assumes Python 3 true division - confirm.
# NOTE(review): X, y, z, lis1, lis2, lis3 and instance are defined outside
# this fragment, and the final loop body may continue past it - confirm.
for i,j,k in zip(X,y,z): lis1.append(instance.compare(i,j)) lis2.append(instance.compare(i,k)) lis3.append(instance.compare(j,k)) ''' transformer = [] for i in a: for j,k in zip(X,y): if i == k: #print(i + '\t' + k) transformer.append(j) for i,j,k in zip(transformer,a,b): lis1.append(instance.compare(i,j)) lis2.append(instance.compare(i,k)) lis3.append(instance.compare(j,k)) filename = 'Transformer_soundex.txt' with open(filename, 'w', encoding='utf-8') as f: f.write("Hindi-Bhojpuri" + '\t\t' + "Hindi-predicted" + '\t\t'+ "Bhojpuri-predicted" + '\n') for a,b,c in zip(lis1, lis2,lis3): f.write(str(a) + '\t\t\t' + str(b) + '\t\t\t'+ str(c) + '\n') ''' for i in range(0, 101, 10): if i == 0: continue newX = X[: int(len(X) * (i/100))]
class BaseMalayalam:
    """
    Malayalam Spell Checker class.

    Words are stemmed and their stem is looked up in a trie of root words.
    Suggestions are ranked with a weight derived from a Soundex comparison,
    a syllable-level Levenshtein distance and a Jaccard coefficient over
    letter unigrams.
    """

    Suggestion = namedtuple('Suggestion', 'word sound lev jac weight tag_list')

    def __init__(self):
        """
        Initialize necessary resources.

        Loads ``data/ml_rootwords.txt`` (relative to this module) into a
        trie and creates the stemmer, inflector, Soundex, syllabifier and
        n-gram helpers.
        """
        # Local import keeps this block self-contained. io.open decodes the
        # file as UTF-8 on both Python 2 and Python 3 instead of relying on
        # the locale's default encoding.
        import io

        dictionary_path = os.path.join(
            os.path.dirname(__file__), 'data/ml_rootwords.txt')
        with io.open(dictionary_path, encoding='utf-8') as dictionary_file:
            root_words = [line.strip() for line in dictionary_file]
        # Attribute kept for backward compatibility; as before, the handle
        # is already closed once construction finishes.
        self.dictionary_file = dictionary_file
        self.dictionary = marisa_trie.Trie(root_words)
        self.stemmer = Stemmer()
        self.inflector = inflector.Inflector(lang='ml')
        self.soundex = Soundex()
        self.syllabalizer = Syllabifier()
        self.ngrammer = Ngram()

    def check(self, word):
        """
        Returns if a word is spelled correctly or not.

        The word is stemmed first; it counts as correct when its stem is
        present in the root-word dictionary.
        """
        root_word = self.stemmer.stem(word)[word]['stem']
        return root_word in self.dictionary

    def get_best_intermediate(self, word, input_word,
                              intermediate_words, original_tag_list):
        """
        Return the best intermediate form from those generated during
        stemming. Best intermediate term is the one for which maximum
        similarity is found. It is used to handle incorrect words getting
        unnecessarily stemmed as they are not present in dictionary.

        :param word: dictionary word every intermediate form is compared to
        :param input_word: fallback selection when there are no
            intermediate forms
        :param intermediate_words: candidate forms, in the order produced
            by ``suggest``
        :param original_tag_list: inflection tags of the original input
        :returns: ``(word_tags_map, highest_weight, selected_word)`` where
            ``word_tags_map`` maps each intermediate form to the prefix of
            ``original_tag_list`` matching its position
        """
        weights = []
        word_tags_map = {}
        for position, intermediate_word in enumerate(intermediate_words):
            _, _, _, weight = self.compare(intermediate_word, word)
            weights.append(weight)
            word_tags_map[intermediate_word] = original_tag_list[:position]

        if not weights:
            return word_tags_map, 0, input_word

        highest_weight = max(weights)
        # index() picks the first form reaching the highest weight.
        selected_word = intermediate_words[weights.index(highest_weight)]
        return word_tags_map, highest_weight, selected_word

    def get_unique(self, list_of_items):
        """
        Return the items of ``list_of_items`` without duplicates, keeping
        the order of first appearance. Items only need to support ``==``.
        """
        result = []
        for item in list_of_items:
            if item not in result:
                result.append(item)
        return result

    def suggest(self, input_word, n=5):
        """
        Returns n suggestions that is similar to word.

        Candidates are the dictionary words starting with the first
        character of the stem, or with its neighbours in ``_characters``.
        Only candidates whose weight exceeds 50 are kept; the ``n`` best are
        re-inflected with the applicable tags and returned without
        duplicates. An empty stem yields an empty list. A stem whose first
        character is missing from ``_characters`` makes
        ``_characters.index`` raise (``ValueError`` for built-in
        sequences).
        """
        stemmer_result = self.stemmer.stem(input_word)[input_word]
        input_word = stemmer_result['stem']
        tag_list = stemmer_result['inflection']
        if not input_word:
            # Nothing to index into; there is no prefix to search with.
            return []

        first_char = input_word[0]
        if first_char == _characters[0]:
            prev_char = first_char
        else:
            prev_char = _characters[_characters.index(first_char) - 1]
        if first_char == _characters[-1]:
            next_char = first_char
        else:
            next_char = _characters[_characters.index(first_char) + 1]

        # At either end of _characters a neighbour equals first_char; query
        # each prefix once so duplicate candidates cannot crowd the top n.
        possible_words = []
        for prefix in self.get_unique([first_char, next_char, prev_char]):
            possible_words.extend(self.dictionary.keys(prefix))

        original_tag_list = tag_list
        # Progressively re-inflected forms, the most inflected one first and
        # the bare stem last.
        intermediate_words = [input_word]
        for tag_counter in range(len(tag_list)):
            new_word = self.inflector.inflect(
                input_word, tag_list[-tag_counter - 1:])
            intermediate_words.insert(0, new_word)

        final = []
        for word in possible_words:
            lev, sound, jac, weight1 = self.compare(input_word, word)
            word_tags_map, highest_weight, selected_word = \
                self.get_best_intermediate(
                    word, input_word, intermediate_words, original_tag_list)
            tag_list = original_tag_list
            if highest_weight >= weight1 and selected_word != input_word:
                tag_list = word_tags_map[selected_word]
            weight = max(weight1, highest_weight)
            if weight > 50:
                # self.Suggestion resolves to the class attribute and does
                # not depend on a name defined outside this class.
                final.append(self.Suggestion(
                    word, sound, lev, jac, weight, tag_list))

        best = sorted(final, key=attrgetter('weight'), reverse=True)[:n]
        final_list = []
        for item in best:
            try:
                final_list.append(
                    self.inflector.inflect(item.word, item.tag_list))
            except Exception:
                # Best effort: fall back to the uninflected candidate.
                final_list.append(item.word)
        return self.get_unique(final_list)

    def levenshtein_distance(self, tokens1, tokens2):
        """
        Takes two lists containing tokens of one word each and returns the
        levenshtein distance between them.
        """
        if len(tokens1) < len(tokens2):
            return self.levenshtein_distance(tokens2, tokens1)
        if len(tokens2) == 0:
            return len(tokens1)

        previous_row = list(range(len(tokens2) + 1))
        for i, c1 in enumerate(tokens1):
            current_row = [i + 1]
            for j, c2 in enumerate(tokens2):
                # j+1 instead of j since previous_row and current_row are
                # one element longer than tokens2
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(
                    min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    @staticmethod
    def _jaccard(ngrams1, ngrams2):
        """
        Return the Jaccard coefficient of two n-gram sequences: distinct
        shared items divided by distinct items overall. Returns 0.0 when
        both sequences are empty instead of dividing by zero.
        """
        union = []
        for item in list(ngrams1) + list(ngrams2):
            if item not in union:
                union.append(item)
        if not union:
            return 0.0
        shared = [item for item in union
                  if item in ngrams1 and item in ngrams2]
        return float(len(shared)) / float(len(union))

    def compare(self, word1, word2):
        """
        Returns the similarity measure between two words.

        :returns: ``(levenshtein_distance, soundex_comparison, jaccards,
            weight)``. ``weight`` is 100 when the Soundex comparison is 0
            or 1, ``75 + 1.5 * jaccards`` when the distance is at most 2
            and ``jaccards`` exceeds 0.5, ``65 + 3 * jaccards`` when the
            distance is below 5 and ``jaccards`` exceeds 0.5, and 0
            otherwise.
        """
        soundex_comparison = self.soundex.compare(word1, word2)
        tokens1 = self.syllabalizer.syllabify_ml(word1)
        tokens2 = self.syllabalizer.syllabify_ml(word2)
        levenshtein_distance = self.levenshtein_distance(tokens1, tokens2)
        ngram1 = self.ngrammer.letterNgram(word1, 1)
        ngram2 = self.ngrammer.letterNgram(word2, 1)
        jaccards = self._jaccard(ngram1, ngram2)

        if soundex_comparison == 1 or soundex_comparison == 0:
            weight = 100
        elif levenshtein_distance <= 2 and jaccards > 0.5:
            weight = 75 + (1.5 * jaccards)
        elif levenshtein_distance < 5 and jaccards > 0.5:
            weight = 65 + (3 * jaccards)
        else:
            weight = 0
        return levenshtein_distance, soundex_comparison, jaccards, weight

    def check_and_generate(self, word):
        """
        Receives a word as input, checks if it is a valid word and returns
        the suggestions if it is not.
        Returns 0 along with suggestions if an incorrect word.
        Returns 1 along with blank list of suggestions if word in dictionary.
        Returns 2 along with blank list of suggestions if word is unique.
        """
        if self.check(word):
            return {'status': 1, 'suggestions': []}

        suggestions = self.suggest(word)
        if suggestions:
            return {'status': 0, 'suggestions': suggestions}

        # If there were no suggestions, it means the word was not
        # similar to any of the existing root words. So, that was not a
        # mistake, but an intended insertion. Hence, it is deemed as a
        # valid word
        return {'status': 2, 'suggestions': []}
class InexactSearch(object):
    """
    This class provides methods for fuzzy searching using word distance as
    well as phonetics.
    """

    def __init__(self):
        # Soundex helper used by compare() for the phonetic match.
        self.sx = Soundex()

    def _countCommon(self, shrtBigr, lngBigr, average):
        """Score the bigrams of ``shrtBigr`` that also occur in ``lngBigr``.

        A bigram found at the same index adds 1.0. One found at a different
        index adds 1.0 minus the index gap divided by ``average``. Only the
        first occurrence in ``lngBigr`` is considered.
        """
        common = 0.0
        for indexShrt, bigr in enumerate(shrtBigr):
            if bigr not in lngBigr:
                continue
            indexLng = lngBigr.index(bigr)
            if indexLng == indexShrt:
                common += 1.0
            else:
                common += 1.0 - abs((indexLng - indexShrt) / average)
        return common

    def _createBigram(self, string):
        """Return the overlapping two-character slices of ``string``."""
        return [string[i - 1:i + 1] for i in range(1, len(string))]

    def bigram_average(self, str1, str2):
        """Return approximate string comparator measure (between 0.0 and 1.0)
        using bigrams.

        :param str1: string 1 for comparison
        :type str1: str
        :param str2: string 2 for comparison
        :type str2: str
        :returns: score between 0.0 and 1.0 (1 when the strings are equal)

        >>> score = bigram_average(str1, str2)
        0.7

        Bigrams are two-character sub-strings contained in a string.
        For example, 'peter' contains the bigrams: pe,et,te,er.

        This routine counts the number of common bigrams and divides by
        the average number of bigrams. The resulting number is returned.
        Two different strings that are both too short to contain a bigram
        score 0.0.
        """
        if str1 == str2:
            return 1

        bigr1 = self._createBigram(str1)
        bigr2 = self._createBigram(str2)
        average = (len(bigr1) + len(bigr2)) / 2.0
        if average == 0:
            # Neither string has a bigram, so there is nothing in common
            # and the division below would fail.
            return 0.0

        # Count using the shorter bigram list
        if len(bigr1) < len(bigr2):
            common = self._countCommon(bigr1, bigr2, average)
        else:
            common = self._countCommon(bigr2, bigr1, average)
        return common / average

    def compare(self, string1, string2):
        '''
        Compare strings using soundex; if that gives no match, fall back
        to the bigram average.

        :param string1: string 1 for comparison.
        :type string1: str.
        :param string2: string 2 for comparison
        :type string2: str.
        :returns: score between 0.0 and 1.0: 1.0 for equal strings, 0.9
            when the soundex comparison yields 1, 0.8 when it yields 2,
            otherwise the value of :meth:`bigram_average`
        '''
        if string1 == string2:
            return 1.0

        soundex_match = self.sx.compare(string1, string2)
        if soundex_match == 1:
            return 0.9
        if soundex_match == 2:
            return 0.8
        return self.bigram_average(string1, string2)

    @servicemethod
    def search(self, text, key):
        '''Searches for the key in the given text. This function uses
        :meth:`InexactSearch.compare` for doing approx search.

        :param text: text in which search has to be done.
        :type text: str.
        :param key: key which has to be searched
        :type key: str.
        :returns: A dictionary with words in the string as keys and the score
                  against the key as the value
        '''
        key = key.strip()
        search_results = {}
        for word in text.split():
            word = word.strip()
            # A word occurring several times maps to a single entry.
            search_results[word] = self.compare(word, key)
        return search_results