class SymSpellChecker(object):
    """Spell checker backed by SymSpell unigram and bigram dictionaries."""

    # Previous hard-coded locations kept as backward-compatible defaults;
    # callers can now point the checker at their own dictionary files.
    DEFAULT_UNIGRAM_PATH = '/home/citao/github/symspellpy/frequency_dictionary_en_82_765.txt'
    DEFAULT_BIGRAM_PATH = '/home/citao/github/symspellpy/frequency_bigramdictionary_en_243_342.txt'

    def __init__(self, unigram_path=DEFAULT_UNIGRAM_PATH,
                 bigram_path=DEFAULT_BIGRAM_PATH):
        """Build the SymSpell index from the given dictionary files.

        :param unigram_path: term/frequency dictionary (term col 0, count col 1)
        :param bigram_path: bigram dictionary (term col 0, count col 2)
        """
        self.checker = SymSpell(max_dictionary_edit_distance=2)
        self.checker.load_dictionary(unigram_path, 0, 1)
        self.checker.load_bigram_dictionary(bigram_path, 0, 2)

    def correct(self, word):
        """Return the closest correction for *word*, logging the change.

        Falls back to *word* itself when SymSpell has no suggestion.
        """
        suggestions = self.checker.lookup(word, Verbosity.CLOSEST,
                                          max_edit_distance=2)
        if suggestions:
            cor_word = suggestions[0].term
            logging.info('Spell check: [{}] -> [{}]'.format(word, cor_word))
            return cor_word
        return word

    def correct_text(self, text):
        """Correct each space-separated token of *text*.

        Tokens without a suggestion are kept unchanged; no logging is done
        here (matching the original behavior of this method).
        """
        cor_list = []
        for word in text.split(' '):
            suggestions = self.checker.lookup(word, Verbosity.CLOSEST,
                                              max_edit_distance=2)
            # Take the top suggestion when one exists, else keep the token.
            cor_list.append(suggestions[0].term if suggestions else word)
        return ' '.join(cor_list)
def test_lookup_transfer_casing(self):
    """transfer_casing=True must re-apply the query's casing pattern."""
    # (dictionary entry, query, expected cased suggestion)
    cases = [
        ("steam", "Stream", "Steam"),
        ("steam", "StreaM", "SteaM"),
        ("steam", "STREAM", "STEAM"),
        ("i", "I", "I"),
    ]
    for entry, query, expected in cases:
        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry(entry, 4)
        result = sym_spell.lookup(query, Verbosity.TOP, 2,
                                  transfer_casing=True)
        self.assertEqual(expected, result[0].term)
def test_words_with_shared_prefix_should_retain_counts(self):
    """Two words sharing a prefix must each keep their own count."""
    sym_spell = SymSpell(1, 3)
    sym_spell.create_dictionary_entry("pipe", 5)
    sym_spell.create_dictionary_entry("pips", 10)
    # query -> expected ordered (term, count) pairs
    expectations = {
        "pipe": [("pipe", 5), ("pips", 10)],
        "pips": [("pips", 10), ("pipe", 5)],
        "pip": [("pips", 10), ("pipe", 5)],
    }
    for query, expected in expectations.items():
        result = sym_spell.lookup(query, Verbosity.ALL, 1)
        self.assertEqual(len(expected), len(result))
        for (term, count), suggestion in zip(expected, result):
            self.assertEqual(term, suggestion.term)
            self.assertEqual(count, suggestion.count)
def test_lookup_should_not_return_non_word_delete(self):
    """Deletes of a dictionary word are not words themselves."""
    sym_spell = SymSpell(2, 7, 10)
    sym_spell.create_dictionary_entry("pawn", 10)
    # "paw" and "awn" exist only as delete-variants of "pawn", so an
    # exact (distance 0) lookup must return nothing.
    for query in ("paw", "awn"):
        result = sym_spell.lookup(query, Verbosity.TOP, 0)
        self.assertEqual(0, len(result))
def test_lookup_should_not_return_non_word_delete(self):
    """Delete-only entries must not surface as suggestions."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 2, 7, 10)
    sym_spell.create_dictionary_entry("pawn", 10)
    # Both queries are deletes of "pawn", never dictionary words.
    for query in ("paw", "awn"):
        self.assertEqual(0, len(sym_spell.lookup(query, Verbosity.TOP, 0)))
def test_add_additional_counts_should_increase_count(self):
    """Re-adding a word must accumulate its count."""
    sym_spell = SymSpell()
    word = "hello"

    def top_count():
        # Count of the single TOP suggestion; 0 when the result is not
        # exactly one entry.
        result = sym_spell.lookup(word, Verbosity.TOP)
        return result[0].count if len(result) == 1 else 0

    sym_spell.create_dictionary_entry(word, 11)
    self.assertEqual(11, top_count())
    sym_spell.create_dictionary_entry(word, 3)
    self.assertEqual(11 + 3, top_count())
def test_add_additional_counts_should_not_overflow(self):
    """Counts must saturate at sys.maxsize instead of overflowing."""
    sym_spell = SymSpell()
    word = "hello"

    def top_count():
        # Count of the unique TOP suggestion, or 0 on ambiguity.
        result = sym_spell.lookup(word, Verbosity.TOP)
        return result[0].count if len(result) == 1 else 0

    sym_spell.create_dictionary_entry(word, sys.maxsize - 10)
    self.assertEqual(sys.maxsize - 10, top_count())
    # Adding 11 more would exceed maxsize; the count must clamp.
    sym_spell.create_dictionary_entry(word, 11)
    self.assertEqual(sys.maxsize, top_count())
def test_verbosity_should_control_lookup_results(self):
    """Each verbosity level widens the result set."""
    sym_spell = SymSpell()
    for term, count in (("steam", 1), ("steams", 2), ("steem", 3)):
        sym_spell.create_dictionary_entry(term, count)
    # TOP -> one best match, CLOSEST -> all at minimal distance,
    # ALL -> everything within the maximum distance.
    for verbosity, expected_len in ((Verbosity.TOP, 1),
                                    (Verbosity.CLOSEST, 2),
                                    (Verbosity.ALL, 3)):
        result = sym_spell.lookup("steems", verbosity, 2)
        self.assertEqual(expected_len, len(result))
def test_lookup_should_replicate_noisy_results(self):
    """Total suggestion count over the noisy query set is a known constant."""
    print(' - %s' % inspect.stack()[0][3])
    cwd = os.path.realpath(os.path.dirname(__file__))
    dictionary_path = os.path.realpath(
        os.path.join(cwd, pardir, "symspellpy",
                     "frequency_dictionary_en_82_765.txt"))
    query_path = os.path.join(cwd, "fortests", "noisy_query_en_1000.txt")
    edit_distance_max = 2
    prefix_length = 7
    verbosity = Verbosity.CLOSEST
    sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    # The query phrase is the first column of every line that has at
    # least two space-separated columns.
    with open(query_path, "r") as infile:
        test_list = [parts[0]
                     for parts in (line.rstrip().split(" ")
                                   for line in infile)
                     if len(parts) >= 2]
    result_sum = sum(
        len(sym_spell.lookup(phrase, verbosity, edit_distance_max))
        for phrase in test_list)
    self.assertEqual(4945, result_sum)
def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
        self):
    """A below-threshold word that is also a delete must stay hidden."""
    sym_spell = SymSpell(2, 7, 10)
    sym_spell.create_dictionary_entry("flame", 20)
    # "flam" is both a delete of "flame" and a real entry, but its count
    # (1) is under the count threshold (10).
    sym_spell.create_dictionary_entry("flam", 1)
    self.assertEqual(0, len(sym_spell.lookup("flam", Verbosity.TOP, 0)))
class SymSpellModel(CandidateModelBase):
    """
    Candidate model based on symspell algorithm.

    https://github.com/wolfgarbe/SymSpell
    """

    def __init__(
        self,
        config: Config,
    ):
        """Create the model and eagerly load the configured dictionary."""
        self.sym_spell = SymSpell()
        self.config = config
        self.load_dictionary()

    def load_dictionary(self) -> None:
        """Load the frequency dictionary configured on ``self.config``.

        Raises:
            FileNotFoundError: when ``config.DICTIONARY_PATH`` is missing.
        """
        if not self.config.DICTIONARY_PATH.is_file():
            raise FileNotFoundError("Dictionary doesn't exist")
        self.sym_spell.load_dictionary(self.config.DICTIONARY_PATH,
                                       term_index=0,
                                       count_index=1)

    def get_candidates(self, word: str, n=float("inf")) -> List[str]:
        """Return up to ``n`` spelling candidates for ``word``, best first.

        The default ``n`` of infinity keeps every suggestion, so existing
        callers are unaffected.
        """
        suggestions = self.sym_spell.lookup(
            word,
            self.config.verbosity,
            max_edit_distance=self.config.max_edit_distance)
        suggested_words = []
        for i, suggestion in enumerate(suggestions):
            # Fixed off-by-one: the old ``i > n`` test admitted n + 1 items.
            if i >= n:
                break
            suggested_words.append(suggestion.term)
        return suggested_words
def test_lookup_max_edit_distance_too_large(self):
    """Requesting a distance above the index maximum must raise ValueError."""
    sym_spell = SymSpell(2, 7, 10)
    sym_spell.create_dictionary_entry("flame", 20)
    sym_spell.create_dictionary_entry("flam", 1)
    # unittest's own assertRaises replaces the pytest.raises dependency,
    # so the test also passes under a plain unittest runner.
    with self.assertRaises(ValueError) as cm:
        __ = sym_spell.lookup("flam", Verbosity.TOP, 3)
    self.assertEqual("Distance too large", str(cm.exception))
def test_lookup_include_unknown(self):
    """include_unknown=True echoes a below-threshold query back."""
    sym_spell = SymSpell(2, 7, 10)
    sym_spell.create_dictionary_entry("flame", 20)
    sym_spell.create_dictionary_entry("flam", 1)
    query = "flam"
    # Fourth positional argument is include_unknown.
    result = sym_spell.lookup(query, Verbosity.TOP, 0, True)
    self.assertEqual(1, len(result))
    self.assertEqual(query, result[0].term)
def spell_checker(df, pickling=False):
    '''Takes a list of document strings and runs all substrings through
    SymSpell and replaces each with correctly spelled string using the
    dictionary (max Levenshtein distance=2).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Full review" column; rows with missing reviews
        are dropped first.
    pickling : bool, optional
        When True, the result is also pickled to "spell_checked_data.pickle".

    Returns
    -------
    pandas.DataFrame
        *df* with an added "Spell-checked review" column.
    '''
    df = df[df["Full review"].notna()]
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt",
                              term_index=0, count_index=1)
    reviews = list(df["Full review"])
    cleaned_reviews = []
    for review in tqdm(reviews):
        cleaned_review = []
        for word in review.split():
            # Only words longer than 5 characters are corrected; shorter
            # tokens are left untouched (original behavior).
            if len(word) > 5:
                # Fixed: use the public ``term`` attribute instead of the
                # private ``_term`` (same value, stable API).
                word = sym_spell.lookup(
                    word, Verbosity.CLOSEST, max_edit_distance=2,
                    include_unknown=True, transfer_casing=True,
                    ignore_token='([A-z]+)-([A-z]+)')[0].term
            cleaned_review.append(word)
        cleaned_reviews.append(" ".join(cleaned_review))
    df["Spell-checked review"] = cleaned_reviews
    if pickling:
        with open("spell_checked_data.pickle", "wb") as to_write:
            pickle.dump(df, to_write)
    return df
def f_typo(w_list):
    """
    :param w_list: word list to be processed
    :return: w_list with typo fixed by symspell. words with no match up will
             be dropped
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    # Only load the frequency dictionary when it is not populated yet.
    if not sym_spell.word_count:
        sym_spell.load_dictionary(dictionary_path, term_index=0,
                                  count_index=1)
    w_list_fixed = []
    for word in w_list:
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST,
                                       max_edit_distance=3)
        # Keep the best suggestion; words without any match are dropped.
        if suggestions:
            w_list_fixed.append(suggestions[0].term)
        # do word segmentation, deprecated for inefficiency
        # w_seg = sym_spell.word_segmentation(phrase=word)
        # w_list_fixed.extend(w_seg.corrected_string.split())
    return w_list_fixed
def test_delete_dictionary_entry_invalid_word(self):
    """Deleting an unknown word must fail and leave the index untouched."""
    sym_spell = SymSpell()
    for term, count in (("stea", 1), ("steama", 2), ("steem", 3)):
        sym_spell.create_dictionary_entry(term, count)

    def assert_steama_intact():
        # "steama" is still found and still defines the max word length.
        result = sym_spell.lookup("steama", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        self.assertEqual("steama", result[0].term)
        self.assertEqual(len("steama"), sym_spell._max_length)

    assert_steama_intact()
    # Removing a word that was never added reports failure...
    self.assertFalse(sym_spell.delete_dictionary_entry("steamab"))
    # ...and nothing in the dictionary changes.
    assert_steama_intact()
def test_lookup_include_unknown(self):
    """An unmatched query is echoed back when include_unknown is set."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 2, 7, 10)
    sym_spell.create_dictionary_entry("flame", 20)
    sym_spell.create_dictionary_entry("flam", 1)
    query = "qwer"
    result = sym_spell.lookup(query, Verbosity.TOP, 0, True)
    self.assertEqual(1, len(result))
    self.assertEqual(query, result[0].term)
def test_lookup_should_find_exact_match(self):
    """TOP lookup returns the single matching correction."""
    sym_spell = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        sym_spell.create_dictionary_entry(term, count)
    result = sym_spell.lookup("streama", Verbosity.TOP, 2)
    self.assertEqual(1, len(result))
    self.assertEqual("steama", result[0].term)
def test_lookup_should_return_most_frequent(self):
    """TOP verbosity keeps only the most frequent closest suggestion."""
    sym_spell = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        sym_spell.create_dictionary_entry(term, count)
    result = sym_spell.lookup("stream", Verbosity.TOP, 2)
    self.assertEqual(1, len(result))
    self.assertEqual("steamb", result[0].term)
    self.assertEqual(6, result[0].count)
def test_lookup_avoid_exact_match_early_exit(self):
    """A token matching ignore_token is returned as-is, uncorrected."""
    edit_distance_max = 2
    sym_spell = SymSpell(edit_distance_max, 7, 10)
    sym_spell.create_dictionary_entry("flame", 20)
    sym_spell.create_dictionary_entry("flam", 1)
    # "24th" matches the ignore pattern, so the query itself is the only
    # result even under Verbosity.ALL.
    result = sym_spell.lookup("24th", Verbosity.ALL, edit_distance_max,
                              ignore_token=r"\d{2}\w*\b")
    self.assertEqual(1, len(result))
    self.assertEqual("24th", result[0].term)
def test_deletes(self):
    """Index construction must populate the deletes map."""
    sym_spell = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        sym_spell.create_dictionary_entry(term, count)
    result = sym_spell.lookup("stream", Verbosity.TOP, 2)
    self.assertEqual(1, len(result))
    self.assertEqual("steamb", result[0].term)
    self.assertEqual(6, result[0].count)
    # Adding entries must have created delete-variant keys.
    self.assertTrue(len(sym_spell.deletes))
def test_load_dictionary_encoding(self):
    """A non-English dictionary loads correctly with an explicit encoding."""
    dictionary_path = os.path.join(self.fortests_path, "non_en_dict.txt")
    sym_spell = SymSpell(2, 7)
    sym_spell.load_dictionary(dictionary_path, 0, 1, encoding="utf-8")
    # The Cyrillic query must resolve against the loaded dictionary.
    result = sym_spell.lookup("АБ", Verbosity.TOP, 2)
    self.assertEqual(1, len(result))
    self.assertEqual("АБИ", result[0].term)
class NameChecker(object):
    """Fuzzy matcher mapping a possibly misspelled name to a known list."""

    def __init__(self, name_list):
        """Index every name in *name_list* for fuzzy lookup.

        Each name's "frequency" is its space-separated word count
        (matching the original implementation).
        """
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        for each_name in name_list:
            self.sym_spell.create_dictionary_entry(
                each_name, len(each_name.split(' ')))

    def get_name(self, name):
        """Return the closest indexed name, or *name* when nothing matches."""
        suggestions = self.sym_spell.lookup(name, Verbosity.CLOSEST,
                                            max_edit_distance=2,
                                            transfer_casing=True)
        # Truthiness covers both None and empty-list results.
        if suggestions:
            return suggestions[0].term
        return name
def test_load_dictionary_encoding(self):
    """Explicit utf-8 encoding lets a non-English dictionary load."""
    print(' - %s' % inspect.stack()[0][3])
    cwd = os.path.realpath(os.path.dirname(__file__))
    dictionary_path = os.path.realpath(
        os.path.join(cwd, "fortests", "non_en_dict.txt"))
    sym_spell = SymSpell(83000, 2, 7)
    sym_spell.load_dictionary(dictionary_path, 0, 1, encoding="utf-8")
    result = sym_spell.lookup("АБ", Verbosity.TOP, 2)
    self.assertEqual(1, len(result))
    self.assertEqual("АБИ", result[0].term)
def symspell_correction(
        misspelled):  # not used because it is too expensive
    """Return the most frequent SymSpell correction for *misspelled*.

    Falls back to compound lookup when the simple lookup has no hits,
    and to "" when the dictionary cannot be loaded.
    """
    from symspellpy import SymSpell, Verbosity
    sym_spell = SymSpell(83000, 2)
    dictionary_path = resdir + "frequency_dictionary_en_82_765.txt"
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        return ""
    suggestions = sym_spell.lookup(misspelled, Verbosity.CLOSEST, 2)
    if suggestions:
        # max() with a count key picks the same item as sorting
        # descending by count and taking the head.
        return max(suggestions, key=lambda s: s.count).term
    compound = sym_spell.lookup_compound(misspelled, 2)
    return max(compound, key=lambda s: s.count).term
def process_comments(comments_column):
    """Clean a pandas Series of comments.

    Steps: expand contractions, lowercase, strip twitter artifacts
    (handles, hashtags, urls, RT), spell-check each token with SymSpell,
    then lemmatise verbs. Returns the transformed Series.
    """
    # Apostrophe expansion
    comments_column = comments_column.apply(lambda x: x.replace("’", "'"))
    comments_column = comments_column.apply(lambda x: expandContractions(x))
    # Lowercase tweets
    comments_column = comments_column.apply(lambda x: x.lower())
    # Remove url, hashtags, cashtags, twitter handles, and RT. Only words
    comments_column = comments_column.apply(lambda x: ' '.join(
        re.sub(
            r"(@[A-Za-z0-9]+)|^rt |(#[A-Za-z0-9]+) |(\w+:\/*\S+)|[^a-zA-Z\s]",
            "", x).split()))
    # Remove url token
    comments_column = comments_column.apply(lambda x: x.replace('url', ''))
    # Lemmatisation
    tokeniser = TweetTokenizer()
    wordnet_lemmatizer = WordNetLemmatizer()
    comments_column = comments_column.apply(
        lambda x: [word for word in tokeniser.tokenize(x)])
    sym_spell = SymSpell()
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    print("Spell checker...")
    for i in range(len(comments_column)):
        try:
            if i == (len(comments_column) - 1) or i % 10000 == 0:
                print('%i out of %i' % (i, len(comments_column)))
            for j in range(len(comments_column[i])):
                suggestions = sym_spell.lookup(comments_column[i][j],
                                               Verbosity.CLOSEST,
                                               max_edit_distance=2)
                if suggestions:
                    # BUG FIX: lookup() returns SuggestItem objects, not
                    # strings. The old ``suggestions[0].split(',')`` raised
                    # AttributeError, which the bare except silently
                    # swallowed, so no correction was ever applied.
                    comments_column[i][j] = str(suggestions[0].term)
        except Exception:
            # Best-effort: a failure on one comment must not abort the
            # whole batch, but we no longer mask SystemExit and friends.
            continue
    comments_column = comments_column.apply(lambda x: ' '.join(
        [wordnet_lemmatizer.lemmatize(word, pos="v") for word in x]))
    return comments_column
def useSymspell(self):
    """Correct the error text word-by-word with SymSpell, then score the
    corrected words against the original text."""
    self.originalText, self.errorText = FP().prepareFiles()
    originalSentencesList, errorSentencesList = EC().textToSentences(
        self.originalText, self.errorText)
    print(len(originalSentencesList), len(errorSentencesList))
    speller = SymSpell()
    corpusPath = FP().definePathToCoprus()
    speller.create_dictionary(corpusPath, encoding='utf-8')
    processedWordsList = []
    for sentence in errorSentencesList:
        for word in EC().sentencesToWords(sentence):
            suggestions = speller.lookup(word, Verbosity.CLOSEST,
                                         max_edit_distance=2,
                                         include_unknown=True)
            # Keep only the top suggestion for each word.
            if suggestions:
                processedWordsList.append(suggestions[0].term)
    print(len(processedWordsList))
    self.useWordsMetrics(self.originalText, processedWordsList)
def spell_checker(self):
    '''Takes a list of document strings and runs all substrings through
    SymSpell and replaces each with correctly spelled string using the
    dictionary (max Levenshtein distance=2).

    Reads ``self.reviews`` (list of strings) and overwrites it with the
    spell-checked versions.
    '''
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt",
                              term_index=0, count_index=1)
    reviews = self.reviews
    cleaned_reviews = []
    # BUG FIX: the outer loop previously iterated the undefined name
    # ``list_of_reviews`` (NameError at runtime); it now iterates the
    # reviews actually read from the instance.
    for review in tqdm(reviews):
        cleaned_review = []
        for word in tqdm(review.split()):
            # Only words longer than 5 characters are corrected.
            if len(word) > 5:
                # Fixed: public ``term`` attribute instead of private
                # ``_term`` (same value, stable API).
                word = sym_spell.lookup(
                    word, Verbosity.CLOSEST, max_edit_distance=2,
                    include_unknown=True, transfer_casing=True,
                    ignore_token='([A-z]+)-([A-z]+)')[0].term
            cleaned_review.append(word)
        cleaned_reviews.append(" ".join(cleaned_review))
    self.reviews = cleaned_reviews
def test_lookup_should_replicate_noisy_results(self):
    """Total suggestion count over the noisy query set is a known constant."""
    query_path = os.path.join(self.fortests_path, "noisy_query_en_1000.txt")
    edit_distance_max = 2
    sym_spell = SymSpell(edit_distance_max, 7)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)
    # The query phrase is the first column of each line having at least
    # two space-separated columns.
    with open(query_path, "r") as infile:
        test_list = [parts[0]
                     for parts in (line.rstrip().split(" ")
                                   for line in infile)
                     if len(parts) >= 2]
    result_sum = sum(
        len(sym_spell.lookup(phrase, Verbosity.CLOSEST, edit_distance_max))
        for phrase in test_list)
    self.assertEqual(4945, result_sum)
def replace(self):
    '''Generates a new text file by correcting the spellings.

    Reads ``self.txtfile`` (a list of token lists), corrects each
    alphabetic token longer than one character in place, and writes the
    result to ``self.new_name``.
    '''
    sym_spell = SymSpell(max_dictionary_edit_distance=self.maxd,
                         prefix_length=self.prefix_len)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    txtfile_corrected = self.txtfile
    for line in txtfile_corrected:
        # BUG FIX: ``line[line.index(word)]`` always targets the FIRST
        # occurrence of a word, so duplicated words in a line were
        # corrected at the wrong position. enumerate tracks the true index.
        for idx, word in enumerate(line):
            if word.isalpha() and len(word) > 1:
                suggestions = sym_spell.lookup(
                    word, Verbosity.TOP,
                    max_edit_distance=self.maxd,
                    transfer_casing=True,
                    include_unknown=True)
                # include_unknown=True is set, so indexing [0] matches
                # the original code's assumption of a non-empty result.
                line[idx] = suggestions[0].term
    write_file(self.new_name, txtfile_corrected)