def test_lookup_compound(self):
    """Check ``lookup_compound`` on four misspelled sentences.

    Only the unigram dictionary is loaded. For every sentence exactly one
    suggestion must be returned, and its term, edit distance and count
    must equal the expected values listed in the table below.
    """
    max_distance = 2
    checker = SymSpell(max_distance, 7)
    checker.load_dictionary(self.dictionary_path, 0, 1)

    # (input phrase, expected term, expected distance, expected count)
    cases = (
        ("whereis th elove hehad dated forImuch of thepast who "
         "couqdn'tread in sixthgrade and ins pired him",
         "where is the love he had dated for much of the past "
         "who couldn't read in sixth grade and inspired him",
         9, 300000),
        ("in te dhird qarter oflast jear he hadlearned ofca sekretplan",
         "in the third quarter of last year he had learned of a "
         "secret plan",
         9, 23121323),
        ("the bigjest playrs in te strogsommer film slatew ith plety "
         "of funn",
         "the biggest players in the strong summer film slate "
         "with plenty of fun",
         9, 3813904),
        ("Can yu readthis messa ge despite thehorible sppelingmsitakes",
         "can you read this message despite the horrible "
         "spelling mistakes",
         10, 6218089),
    )

    for phrase, wanted_term, wanted_distance, wanted_count in cases:
        suggestions = checker.lookup_compound(phrase, max_distance)
        self.assertEqual(1, len(suggestions))
        self.assertEqual(wanted_term, suggestions[0].term)
        self.assertEqual(wanted_distance, suggestions[0].distance)
        self.assertEqual(wanted_count, suggestions[0].count)
def test_lookup_compound_transfer_casing(self):
    """Check that ``transfer_casing=True`` carries the input casing over.

    Loads the unigram and the bigram dictionary, corrects one mixed-case
    sentence and compares the term of the first suggestion.
    """
    max_distance = 2
    checker = SymSpell(max_distance, 7)
    checker.load_dictionary(self.dictionary_path, 0, 1)
    checker.load_bigram_dictionary(self.bigram_path, 0, 2)

    misspelled = ("Whereis th elove hehaD Dated forImuch of thepast who "
                  "couqdn'tread in sixthgrade AND ins pired him")
    expected = ("Where is the love he haD Dated for much of the past "
                "who couldn't read in sixth grade AND inspired him")

    suggestions = checker.lookup_compound(misspelled, max_distance,
                                          transfer_casing=True)
    first_term = suggestions[0].term
    self.assertEqual(expected, first_term)
def load_symspell():
    """Build a ``SymSpell`` instance with the packaged English dictionaries.

    Returns:
        A ``SymSpell`` object (edit distance 2, prefix length 7) into which
        the unigram and bigram frequency files shipped inside the
        ``symspellpy`` package have been loaded.
    """
    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    def packaged(filename):
        # Resolve a data file that lives inside the symspellpy package.
        return pkg_resources.resource_filename("symspellpy", filename)

    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    unigram_file = packaged("frequency_dictionary_en_82_765.txt")
    bigram_file = packaged("frequency_bigramdictionary_en_243_342.txt")

    # term_index is the column holding the term, count_index the column
    # holding its frequency.
    checker.load_dictionary(unigram_file, term_index=0, count_index=1)
    checker.load_bigram_dictionary(bigram_file, term_index=0, count_index=2)
    return checker
def test_lookup_compound(self):
    """Check ``lookup_compound`` terms for four misspelled sentences.

    The dictionary file is located relative to this test module. Each
    sentence must yield exactly one suggestion with the expected term.
    """
    print(' - %s' % inspect.stack()[0][3])
    tests_dir = os.path.realpath(os.path.dirname(__file__))
    dictionary_file = os.path.realpath(
        os.path.join(tests_dir, pardir, "symspellpy",
                     "frequency_dictionary_en_82_765.txt"))

    max_distance = 2
    checker = SymSpell(83000, max_distance, 7)
    checker.load_dictionary(dictionary_file, 0, 1)

    # (input phrase, expected corrected phrase)
    cases = (
        ("whereis th elove hehad dated forImuch of thepast who "
         "couqdn'tread in sixthgrade and ins pired him",
         "where is the love he had dated for much of the past "
         "who couldn't read in sixth grade and inspired him"),
        ("in te dhird qarter oflast jear he hadlearned ofca sekretplan",
         "in the third quarter of last year he had learned of a "
         "secret plan"),
        ("the bigjest playrs in te strogsommer film slatew ith plety "
         "of funn",
         "the biggest players in the strong summer film slate "
         "with plenty of fun"),
        ("Can yu readthis messa ge despite thehorible sppelingmsitakes",
         "can you read this message despite the horrible "
         "spelling mistakes"),
    )

    for phrase, wanted_term in cases:
        suggestions = checker.lookup_compound(phrase, max_distance)
        self.assertEqual(1, len(suggestions))
        self.assertEqual(wanted_term, suggestions[0].term)
def initialize_models():
    """Load every model used for spell checking.

    Returns:
        A 4-tuple ``(spacy_nlp, c2v_model, sym_spell_len5, sym_spell_len7)``:
        the spaCy pipeline, the chars2vec model, and two ``SymSpell``
        instances that share one dictionary file but differ in maximum
        edit distance (3 and 4) and prefix length (5 and 7).
    """
    spacy_nlp = spacy.load("en_core_web_sm")
    dictionary_path = pkg_resources_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")

    def build_checker(max_distance, prefix_length):
        # Create one SymSpell index and fill it from the shared dictionary;
        # column 0 holds the term and column 1 its frequency.
        checker = SymSpell(max_dictionary_edit_distance=max_distance,
                           prefix_length=prefix_length)
        checker.load_dictionary(dictionary_path, term_index=0, count_index=1)
        return checker

    sym_spell_len5 = build_checker(3, 5)
    sym_spell_len7 = build_checker(4, 7)

    c2v_model = load_c2v_model("single_word_trained_model")
    return spacy_nlp, c2v_model, sym_spell_len5, sym_spell_len7
def init_sym_spell():
    """Return a ``SymSpell`` instance, using a pickled cache when present.

    If ``words.pkl.gz`` exists in the ``music_manager`` user cache
    directory it is loaded directly. Otherwise the dictionary packaged
    with ``symspellpy`` is loaded, every word of the xz-compressed word
    list that is not yet known is added with the lowest count found in the
    loaded dictionary, and the result is pickled for later calls.
    """
    from pathlib import Path
    from symspellpy import SymSpell
    from ds_tools.fs.paths import get_user_cache_dir

    sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=1)
    dict_path_pkl = Path(get_user_cache_dir('music_manager')) / 'words.pkl.gz'

    # Fast path: a cache written by an earlier call.
    if dict_path_pkl.exists():
        log.debug(f'Loading pickled spellcheck dictionary: {dict_path_pkl}')
        sym_spell.load_pickle(dict_path_pkl)
        return sym_spell

    # Slow path: these modules are only needed to build the cache.
    import lzma
    import pkg_resources

    packaged_dictionary = pkg_resources.resource_filename(
        'symspellpy', 'frequency_dictionary_en_82_765.txt')
    sym_spell.load_dictionary(packaged_dictionary, 0, 1)

    word_list_path_xz = Path(
        pkg_resources.resource_filename('music', '../../etc/scowl/words.xz')
    ).resolve()
    log.debug(
        f'Loading default dictionary + word list from {word_list_path_xz}')
    with lzma.open(word_list_path_xz, 'rt', encoding='utf-8') as stream:
        extra_words = stream.read().splitlines()

    known_words = sym_spell._words
    # New words get the lowest frequency present before any were added.
    lowest_count = min(known_words.values())
    for candidate in extra_words:
        if candidate not in known_words:
            sym_spell.create_dictionary_entry(candidate, lowest_count)

    fmt = 'Saving pickled spellcheck dictionary (this is a one-time action that may take about 15 seconds): {}'
    log.info(fmt.format(dict_path_pkl))
    sym_spell.save_pickle(dict_path_pkl)
    return sym_spell
def load_name_corection(dictionary_path, bigram_path):
    """Create a ``SymSpell`` checker from UTF-8 unigram and bigram files.

    Args:
        dictionary_path: Path of the unigram frequency file (term in
            column 0, count in column 1).
        bigram_path: Path of the bigram frequency file (term in column 0,
            count in column 2).

    Returns:
        The loaded ``SymSpell`` instance (edit distance 2, prefix length 7).
    """
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    checker.load_dictionary(
        dictionary_path,
        term_index=0,
        count_index=1,
        encoding='utf-8',
    )
    checker.load_bigram_dictionary(
        bigram_path,
        term_index=0,
        count_index=2,
        encoding='utf-8',
    )
    return checker
def load_cpn_corection(companies_list, debug=False):
    """Create a ``SymSpell`` checker from a file of company names.

    The file is read as UTF-8, lower-cased, split into lines and each line
    into whitespace-separated tokens. Bigram and unigram frequencies are
    derived from those tokens and loaded into the checker.

    Args:
        companies_list: Path of the text file, one company name per line.
        debug: When true, print the unigram and bigram frequency data.

    Returns:
        The loaded ``SymSpell`` instance (edit distance 5, prefix length 7).
    """
    with open(companies_list, 'r', encoding='utf-8') as handle:
        lowered = handle.read().lower()

    tokenized_names = [name.split() for name in lowered.split('\n')]
    bigram_freq = export_freq_bigram(tokenized_names)
    unigram_freq = export_freq_dic(tokenized_names)

    if debug:
        print(unigram_freq)
        print(bigram_freq)

    checker = SymSpell(max_dictionary_edit_distance=5, prefix_length=7)
    checker.load_dictionary_from_list(unigram_freq, term_index=0,
                                      count_index=1)
    checker.load_bigram_dictionary_from_list(bigram_freq, term_index=0,
                                             count_index=2)
    return checker
def test_word_segmentation_with_arguments(self):
    """Check ``word_segmentation`` when both optional arguments are given.

    Three run-together sentences are segmented with edit distance 0 and
    11 as the third argument; the corrected string must match exactly.
    """
    max_distance = 0
    checker = SymSpell(max_distance, 7)
    checker.load_dictionary(self.dictionary_path, 0, 1)

    # (run-together input, expected segmented sentence)
    cases = (
        ("thequickbrownfoxjumpsoverthelazydog",
         "the quick brown fox jumps over the lazy dog"),
        ("itwasabrightcolddayinaprilandtheclockswerestrikingthirteen",
         "it was a bright cold day in april and the clocks "
         "were striking thirteen"),
        # This input deliberately starts with a space.
        (" itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
         "itwastheageoffoolishness",
         "it was the best of times it was the worst of times "
         "it was the age of wisdom it was the age of foolishness"),
    )

    for joined, wanted in cases:
        segmented = checker.word_segmentation(joined, max_distance, 11)
        self.assertEqual(wanted, segmented.corrected_string)
def test_lookup_compound_ignore_non_words(self):
    """Check ``lookup_compound`` when non-words are to be left alone.

    Six sentences are corrected with ``True`` as the third positional
    argument; numbers and all-uppercase tokens are expected to survive
    unchanged. A seventh sentence is corrected with explicit keyword
    flags and additionally checked for distance and count.
    """
    max_distance = 2
    checker = SymSpell(max_distance, 7)
    checker.load_dictionary(self.dictionary_path, 0, 1)
    checker.load_bigram_dictionary(self.bigram_path, 0, 2)

    # (input phrase, expected corrected phrase)
    cases = (
        ("whereis th elove 123 hehad dated forImuch of THEPAST who "
         "couqdn'tread in SIXTHgrade and ins pired him",
         "where is the love 123 he had dated for much of THEPAST "
         "who couldn't read in sixth grade and inspired him"),
        ("in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan",
         "in the DHIRD 1 quarter of last year he had learned "
         "of a secret plan"),
        ("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
         "of 12 funn",
         "the biggest players in the strong summer film slate "
         "with PLETY of 12 fun"),
        ("Can yu readtHIS messa ge despite thehorible 1234 "
         "sppelingmsitakes",
         "can you read this message despite the horrible 1234 "
         "spelling mistakes"),
        ("Can yu readtHIS messa ge despite thehorible AB1234 "
         "sppelingmsitakes",
         "can you read this message despite the horrible AB1234 "
         "spelling mistakes"),
        ("PI on leave, arrange Co-I to do screening",
         "PI on leave arrange co i to do screening"),
    )
    for phrase, wanted_term in cases:
        suggestions = checker.lookup_compound(phrase, max_distance, True)
        self.assertEqual(1, len(suggestions))
        self.assertEqual(wanted_term, suggestions[0].term)

    # Final case: keyword flags, plus distance and count checks.
    phrase = "is the officeon 1st floor oepn 24/7"
    wanted_term = "is the office on 1st floor open 24/7"
    suggestions = checker.lookup_compound(phrase, max_distance,
                                          split_phrase_by_space=True,
                                          ignore_non_words=True,
                                          ignore_any_term_with_digits=True)
    self.assertEqual(1, len(suggestions))
    self.assertEqual(wanted_term, suggestions[0].term)
    self.assertEqual(2, suggestions[0].distance)
    self.assertEqual(0, suggestions[0].count)
def test_lookup_compound_replaced_words_no_bigram(self):
    """Check ``replaced_words`` after compound lookups without bigrams.

    Three sentences are corrected one after another on the same checker.
    After each lookup the number of recorded replacements must equal the
    running total of expected replacements, and every expected
    misspelling must map to its expected term. Only the last sentence
    has its returned suggestion checked as well.
    """
    max_distance = 2
    checker = SymSpell(max_distance, 7)
    checker.load_dictionary(self.dictionary_path, 0, 1)

    def assert_replaced(expected):
        # Every expected misspelling must be recorded with its fixed term.
        for misspelled, fixed in expected.items():
            self.assertEqual(fixed, checker.replaced_words[misspelled].term)

    # --- first sentence ---
    phrase = ("whereis th elove hehad dated forImuch of thepast who "
              "couqdn'tread in sixthgrade and ins pired him")
    # Kept for reference; this sentence is not asserted.
    wanted_term = ("whereas the love head dated for much of the past who "
                   "couldn't read in sixth grade and inspired him")
    replacement_1 = {
        "whereis": "whereas",
        "th": "the",
        "elove": "love",
        "hehad": "head",
        "forimuch": "for much",
        "thepast": "the past",
        "couqdn'tread": "couldn't read",
        "sixthgrade": "sixth grade",
        "ins": "in",
    }
    checker.lookup_compound(phrase, max_distance)
    running_total = len(replacement_1)
    self.assertEqual(running_total, len(checker.replaced_words))
    assert_replaced(replacement_1)

    # --- second sentence ---
    phrase = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
    # Kept for reference; this sentence is not asserted.
    wanted_term = ("in the third quarter of last year he had learned of a "
                   "secret plan")
    replacement_2 = {
        "te": "the",
        "dhird": "third",
        "qarter": "quarter",
        "oflast": "of last",
        "jear": "year",
        "hadlearned": "had learned",
        "ofca": "of a",
        "sekretplan": "secret plan",
    }
    checker.lookup_compound(phrase, max_distance)
    running_total += len(replacement_2)
    self.assertEqual(running_total, len(checker.replaced_words))
    assert_replaced(replacement_2)

    # --- third sentence ---
    phrase = ("the bigjest playrs in te strogsommer film slatew ith plety "
              "of funn")
    wanted_term = ("the biggest players in the strong summer film slate "
                   "with plenty of fun")
    replacement_3 = {
        "bigjest": "biggest",
        "playrs": "players",
        "strogsommer": "strong summer",
        "slatew": "slate",
        "ith": "with",
        "plety": "plenty",
        "funn": "fun",
    }
    suggestions = checker.lookup_compound(phrase, max_distance)
    running_total += len(replacement_3)
    self.assertEqual(running_total, len(checker.replaced_words))
    self.assertEqual(1, len(suggestions))
    self.assertEqual(wanted_term, suggestions[0].term)
    assert_replaced(replacement_3)
def test_negative_count_threshold(self):
    """Constructing ``SymSpell(1, 3, -1)`` must raise ``ValueError``.

    The exception message must state that the count threshold cannot be
    negative.
    """
    with pytest.raises(ValueError) as raised:
        SymSpell(1, 3, -1)
    message = str(raised.value)
    self.assertEqual("count_threshold cannot be negative", message)
def test_negative_initial_capacity(self):
    """Constructing ``SymSpell(-16, 1, 3)`` must raise ``ValueError``.

    The exception message must state that the initial capacity cannot be
    negative.
    """
    print(' - %s' % inspect.stack()[0][3])
    with pytest.raises(ValueError) as raised:
        SymSpell(-16, 1, 3)
    message = str(raised.value)
    self.assertEqual("initial_capacity cannot be negative", message)
import nltk import csv import string from symspellpy import SymSpell, Verbosity import pkg_resources import pickle from nltk.sentiment.vader import SentimentIntensityAnalyzer with open('reviews_train.csv') as review_file: reader = list(csv.reader(review_file, delimiter=',')) documents = [[row[0], row[1], row[2]] for row in reader] result = [] spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) dictionary_path = pkg_resources.resource_filename( "symspellpy", "frequency_dictionary_en_82_765.txt") spell.load_dictionary(dictionary_path, term_index=0, count_index=1) negs = ['not', 'no', 'didnt'] for review in documents: last_neg = False neg_index = 0 for word in review[1].split(' '): if (len(word) > 0): if review[0].lower() not in word.lower(): word = word.translate(str.maketrans('', '', string.punctuation)) try: word = str( spell.lookup(word, Verbosity.CLOSEST,
def test_lookup_should_not_return_low_count_word(self):
    """An entry created with count 1 must not be returned by ``lookup``.

    The checker is built as ``SymSpell(16, 2, 7, 10)``; the last argument
    is presumably the count threshold the entry fails to reach.
    """
    print(' - %s' % inspect.stack()[0][3])
    checker = SymSpell(16, 2, 7, 10)
    checker.create_dictionary_entry("pawn", 1)
    hits = checker.lookup("pawn", Verbosity.TOP, 0)
    self.assertEqual(0, len(hits))
def test_delete_dictionary_entry_invalid_word(self):
    """Deleting an unknown word must fail and leave the dictionary intact.

    The lookup result for ``"steama"`` and the recorded maximum word
    length are checked before and after the failed deletion.
    """
    sym_spell = SymSpell()
    for term, count in (("stea", 1), ("steama", 2), ("steem", 3)):
        sym_spell.create_dictionary_entry(term, count)

    def assert_steama_present():
        # "steama" is the single top hit and still the longest known word.
        hits = sym_spell.lookup("steama", Verbosity.TOP, 2)
        self.assertEqual(1, len(hits))
        self.assertEqual("steama", hits[0].term)
        self.assertEqual(len("steama"), sym_spell._max_length)

    assert_steama_present()
    self.assertFalse(sym_spell.delete_dictionary_entry("steamab"))
    assert_steama_present()
import pkg_resources
from symspellpy import SymSpell, Verbosity

# Demo script: correct a misspelled multi-word name using name-frequency
# dictionaries that are resolved inside the symspellpy package directory.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "freq_name_dic.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "freq_name_bigram.txt")

# term_index is the column of the term and count_index is the
# column of the term frequency; both files are read as UTF-8.
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1, encoding='utf-8')
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2, encoding='utf-8')

# Look up suggestions for a multi-word input string (supports compound
# splitting and merging).
input_term = "Ngyễn tành nm"

# max_edit_distance applies per lookup (per single word, not per whole
# input string).
# Single-word alternative:
# suggestions = sym_spell.lookup(input_term, Verbosity.ALL, max_edit_distance=2, include_unknown=True)
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)

# Print every suggestion using its default string form.
for suggestion in suggestions:
    print(suggestion)
def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(self):
    """A count-1 entry must not be returned even if a longer word exists.

    ``"flam"`` (count 1) is one deletion away from ``"flame"`` (count 20);
    an exact lookup of ``"flam"`` with distance 0 must return nothing on a
    checker built as ``SymSpell(2, 7, 10)``.
    """
    checker = SymSpell(2, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        checker.create_dictionary_entry(term, count)
    hits = checker.lookup("flam", Verbosity.TOP, 0)
    self.assertEqual(0, len(hits))
class WordSuggester:
    """Suggest words when the input is misspelled.

    Combines an enchant English dictionary, two SymSpell indexes (one with
    edit distance 2 for fuzzy lookup, one with edit distance 0 used for
    word segmentation) and a BERT masked language model that picks the
    best candidate when a word has several suggestions.
    """

    def __init__(self):
        """Load the enchant dictionary, the BERT pipeline and SymSpell."""
        d_print("Initializing the vocabulary set..")
        self.d = enchant.Dict("en_US")

        d_print("Initializing BERT pipeline..")
        self.tok = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertForMaskedLM.from_pretrained("bert-base-uncased")

        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell_cut = SymSpell(max_dictionary_edit_distance=0,
                                      prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        self.sym_spell.load_dictionary(dictionary_path,
                                       term_index=0,
                                       count_index=1)
        self.sym_spell_cut.load_dictionary(dictionary_path,
                                           term_index=0,
                                           count_index=1)

    def cross_word_validate(self, word, word_counts, min_counts=2):
        """Tell whether ``word`` occurs at least ``min_counts`` times.

        Args:
            word (str): the word to check
            word_counts (mapping): occurrence count of each word
            min_counts (int): minimum number of occurrences required

        Returns:
            bool: True when the count of ``word`` reaches ``min_counts``;
            a word missing from ``word_counts`` counts as zero.
        """
        # .get keeps plain dicts from raising KeyError on unknown words.
        return word_counts.get(word, 0) >= min_counts

    def is_multiword(self, word):
        """Check whether enchant suggests ``word`` split into several words.

        Args:
            word (str): the word to check

        Returns:
            tuple: ``(True, phrase)`` for the first suggestion that equals
            ``word`` once its spaces (or its hyphens) are removed, hyphens
            being replaced by spaces in ``phrase``; ``(False, "")``
            otherwise.
        """
        for sugg in self.d.suggest(word):
            if sugg.replace(" ", "") == word:
                return True, sugg
            if sugg.replace("-", "") == word:
                return True, sugg.replace("-", " ")
        return False, ""

    def cross_sugg_validate(self, word, word_counts):
        """Pick the SymSpell suggestion that is most frequent in the context.

        Args:
            word (str): the word to find a correction for
            word_counts (mapping): occurrence count of each context word

        Returns:
            tuple: ``(True, best_word)`` where ``best_word`` is the context
            word with the highest count among the closest SymSpell
            suggestions, or ``(False, "")`` when no suggestion appears in
            the context.
        """
        # A set makes the membership test below O(1) per context word.
        suggestions = {
            s.term for s in self.sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=2)
        }
        present_words = {
            candidate: count
            for candidate, count in word_counts.items()
            if candidate in suggestions
        }
        if not present_words:
            return False, ""
        return True, max(present_words, key=present_words.get)

    def get_word_suggestions(self, word, word_counts):
        """
        Return the suggestions for the word passed in parameter.
        If the word passed in parameter is valid, return a list of len 1
        with the word inside.

        Args:
            word (str): the word to find suggestions for
            word_counts (dict): value counts of word for a given emoji (context)

        Returns:
            dict: ``{"status": <str>, "words": <list of str>}`` where the
            status names the rule that produced the words.
        """
        # If the word appears many times in answers, we keep it
        if self.cross_word_validate(word, word_counts):
            return {"status": "present", "words": [word]}

        # If the word is part of the english vocabulary we keep it
        if self.d.check(word):
            return {"status": "exist", "words": [word]}

        # If the suggestions associated to the word appear in the rest of
        # the answers we keep the most common one
        cross_sugg, corr_word = self.cross_sugg_validate(word, word_counts)
        if cross_sugg:
            return {"status": "cross_suggested", "words": [corr_word]}

        # If the cutting of the word into several words is very confident,
        # we disassemble it
        result = self.sym_spell_cut.word_segmentation(word)
        segmented = result.corrected_string
        # An empty segmentation has no per-character confidence; skipping
        # it avoids a division by zero.
        if segmented and result.log_prob_sum / len(segmented) > -1:
            return {"status": "disassembled1", "words": [segmented]}

        # Same approach using another library
        is_multi, corr_word = self.is_multiword(word)
        if is_multi:
            return {"status": "disassembled2", "words": [corr_word]}

        # We use the other words as a context to select among the suggestions
        suggestions = [
            sugg.term for sugg in self.sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=2)
        ]
        if len(suggestions) > 0:
            return {"status": "corrected", "words": suggestions}

        # The word is probably unknown
        return {"status": "notfound", "words": [word]}

    def get_context_suggestions(self, word_list):
        """
        Applies get_word_suggestions for every word of an emoji's
        vocabulary (context)

        Args:
            word_list (list of str): words to describe the emoji

        Returns:
            [list of dict]: one ``{"status", "words"}`` dict per input
            word, in the same order
        """
        word_counts = Counter(word_list)
        return [
            self.get_word_suggestions(word, word_counts) for word in word_list
        ]

    def find_best_word(self, context, suggestions):
        """
        Find the most appropriate word in suggestions given the context

        Args:
            context (list of str): words defining the context
            suggestions (list of str): suggestions for the word to find

        Returns:
            [str]: the word of suggestions that matches the best the
            context according to BERT output
        """
        # We place the word of interest in the middle of the context
        n = len(context) // 2
        pre_context = " ".join(context[:n])
        post_context = " ".join(context[n:])
        sentence = f"{pre_context} {self.tok.mask_token} {post_context}"

        input_tokens = self.tok.encode(sentence)
        answer_pos = input_tokens.index(self.tok.mask_token_id)
        logits = self.bert(torch.tensor([input_tokens]))[0][0]
        logits = logits[answer_pos]

        # Drop the first and last token id of every encoded suggestion.
        suggestions_tokens = [
            self.tok.encode(word)[1:-1] for word in suggestions
        ]
        # A suggestion's score is the mean logit of its token ids.
        scores = [
            np.mean([logits[i].item() for i in tokens])
            for tokens in suggestions_tokens
        ]
        best_sugg_idx = np.argmax(scores)
        return suggestions[best_sugg_idx]

    def extract_context_suggestions(self, context_suggestions):
        """
        Extract best words for each suggestions in the context suggestions

        Args:
            context_suggestions (list of dict): output of
                get_context_suggestions

        Returns:
            [list of str]: most appropriate words
        """
        # we don't need the status in the current function
        context_suggestions = [sugg["words"] for sugg in context_suggestions]

        ret_words = []
        for suggestions in context_suggestions:
            # single suggestion: the word is not ambiguous
            if len(suggestions) == 1:
                ret_words.append(suggestions[0])
                continue
            # we gather the single words considered as healthy; a
            # one-element list can never equal the ambiguous list itself
            context = [
                word_list[0] for word_list in context_suggestions
                if len(word_list) == 1
            ]
            ret_words.append(self.find_best_word(context, suggestions))
        return ret_words

    def process_context(self, context, verbose=False):
        """
        Correct every word of a context.

        Args:
            context (list of str): words
            verbose (bool): print the words that were modified or not found

        Returns:
            [list of str]: corrected words; the input itself when the
            DEBUG environment variable is set
        """
        if os.environ.get("DEBUG") is not None:
            d_print("Test --> test")
            d_print("Test --> test")
            return context

        context_suggestions = self.get_context_suggestions(context)
        corr_words = self.extract_context_suggestions(context_suggestions)

        if verbose:
            for word, suggestions, corr_word in zip(context,
                                                    context_suggestions,
                                                    corr_words):
                status = suggestions["status"]
                if status == "notfound":
                    d_print(f"Nof found: {word}")
                elif status not in ["present", "exist"] and word != corr_word:
                    d_print(f"Modified: {word} --> {corr_word} ({status})")
        return corr_words

    def correct_prod_df(self, form_df, debug=False):
        """
        Correct inplace misspelled words of a dataframe in productions format

        Args:
            form_df (DataFrame): frame with an "emoji" and a "word" column
            debug (bool): unused, kept for interface compatibility
        """
        grouped_df = form_df.groupby("emoji")
        # TODO: remove the limitation
        em_indexes = list(grouped_df.groups.items())
        for emoji, indexes in tqdm(em_indexes):
            words = grouped_df.get_group(emoji)["word"].to_list()
            corr_words = self.process_context(words, verbose=True)
            # One .loc call writes into form_df itself; the chained form
            # form_df["word"].loc[...] may assign to a temporary copy.
            form_df.loc[indexes, "word"] = corr_words
import pkg_resources
from symspellpy import SymSpell, Verbosity

# An average 5 letter word has about 3 million possible spelling errors
# within a maximum edit distance of 3
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

# Becomes True once both frequency dictionaries are loaded into sym_spell.
_dictionaries_loaded = False


def spell_corrector(input_term):
    """Correct a multi-word input string with SymSpell compound lookup.

    The dictionaries are loaded on the first call only. Loading them on
    every call re-parses both files each time and adds the file counts on
    top of the counts already stored.

    Args:
        input_term: The possibly misspelled sentence.
    """
    global _dictionaries_loaded
    if not _dictionaries_loaded:
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        sym_spell.load_dictionary(dictionary_path, term_index=0,
                                  count_index=1)
        sym_spell.load_bigram_dictionary(bigram_path, term_index=0,
                                         count_index=2)
        _dictionaries_loaded = True

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging); max edit distance is per single word, not per
    # whole input string
    suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)

    sent = list(suggestions)
    # String form of the first suggestion.
    predicted_sentence = str(sent[0])
    # NOTE(review): the visible code stops here without returning
    # predicted_sentence - confirm against the full file.
def test_negative_count_threshold(self):
    """Constructing ``SymSpell(16, 1, 3, -1)`` must raise ``ValueError``.

    The exception message must state that the count threshold cannot be
    negative.
    """
    print(' - %s' % inspect.stack()[0][3])
    with pytest.raises(ValueError) as raised:
        SymSpell(16, 1, 3, -1)
    message = str(raised.value)
    self.assertEqual("count_threshold cannot be negative", message)
def test_negative_max_dictionary_edit_distance(self):
    """Constructing ``SymSpell(16, -1, 3)`` must raise ``ValueError``.

    The exception message must state that the maximum dictionary edit
    distance cannot be negative.
    """
    print(' - %s' % inspect.stack()[0][3])
    with pytest.raises(ValueError) as raised:
        SymSpell(16, -1, 3)
    message = str(raised.value)
    self.assertEqual("max_dictionary_edit_distance cannot be negative",
                     message)
def test_create_dictionary_invalid_path(self):
    """``create_dictionary`` must return ``False`` for a missing file."""
    checker = SymSpell(2, 7)
    outcome = checker.create_dictionary("invalid/dictionary/path.txt")
    self.assertEqual(False, outcome)
def test_verbosity_should_control_lookup_results(self):
    """The verbosity level must decide how many suggestions come back.

    With three similar entries in the dictionary, looking up ``"steems"``
    at distance 2 must yield 1, 2 and 3 results for ``TOP``, ``CLOSEST``
    and ``ALL`` respectively.
    """
    sym_spell = SymSpell()
    for term, count in (("steam", 1), ("steams", 2), ("steem", 3)):
        sym_spell.create_dictionary_entry(term, count)

    expectations = (
        (Verbosity.TOP, 1),
        (Verbosity.CLOSEST, 2),
        (Verbosity.ALL, 3),
    )
    for verbosity, wanted_hits in expectations:
        found = sym_spell.lookup("steems", verbosity, 2)
        self.assertEqual(wanted_hits, len(found))
def test_lookup_transfer_casing(self):
    """``lookup`` with ``transfer_casing=True`` must mirror input casing.

    Each case uses a fresh checker that only knows ``"steam"`` and checks
    the term of the top suggestion for a differently-cased ``"stream"``.
    """
    # (input, expected term of the top suggestion)
    cases = (
        ("Stream", "Steam"),
        ("StreaM", "SteaM"),
        ("STREAM", "STEAM"),
    )
    for typed, wanted in cases:
        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 4)
        hits = sym_spell.lookup(typed, Verbosity.TOP, 2,
                                transfer_casing=True)
        self.assertEqual(wanted, hits[0].term)
def test_lookup_should_not_return_low_count_word(self):
    """An entry created with count 1 must not be returned by ``lookup``.

    The checker is built as ``SymSpell(2, 7, 10)``; the last argument is
    presumably the count threshold the entry fails to reach.
    """
    checker = SymSpell(2, 7, 10)
    checker.create_dictionary_entry("pawn", 1)
    hits = checker.lookup("pawn", Verbosity.TOP, 0)
    self.assertEqual(0, len(hits))
class SymSpellChecker:
    """Spell-correct query terms with SymSpell and merge their frequencies.

    Queries are read from a ``term,frequency`` file, each word is replaced
    by its best SymSpell suggestion, and the frequencies of queries that
    end up identical are summed.
    """

    def __init__(self, config_loader: ConfigLoader, parser: Parser):
        """Read the configuration and create an empty SymSpell index.

        Args:
            config_loader: Supplies the high-frequency threshold and the
                path prefix of the per-iteration filter files.
            parser: Used to write the corrected dictionary to a file.
        """
        self.__high_frequency_threshold = (
            config_loader.get_high_frequency_threshold())
        self.__parser = parser
        self.__sym_spell_filtered_file_path = (
            config_loader.get_sym_spell_filtered_file_path())
        self.__sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                    prefix_length=7)
        self.__english_dictionary = set(words.words())

    def __get_best_suggestion_term(self, suggestions, split_term):
        """Choose the replacement for ``split_term``.

        The original word is kept when it is the first suggestion and its
        count is at least the highest suggestion count multiplied by the
        high-frequency threshold. Otherwise the term of the suggestion
        with the highest count is returned; on equal counts the earliest
        suggestion wins.

        Args:
            suggestions: Non-empty sequence of suggestion items exposing
                ``term`` and ``count``.
            split_term: The word that was looked up.

        Returns:
            The chosen term as a string.
        """
        first = suggestions[0]
        # max() returns the first maximal element, like the manual scan.
        most_frequent = max(suggestions, key=lambda item: item.count)
        keeps_original = (
            first.term == split_term
            and first.count
            >= most_frequent.count * self.__high_frequency_threshold)
        if keeps_original:
            return split_term
        return most_frequent.term

    def __load_sym_spell_dictionary(self, dictionary_file):
        """Load a comma-separated ``term,count`` file into SymSpell."""
        self.__sym_spell.load_dictionary(dictionary_file,
                                         term_index=0,
                                         count_index=1,
                                         separator=',')

    def __sym_spell_lookup(self, input_term):
        """Return all suggestions within edit distance 1 of ``input_term``.

        ``include_unknown=True`` is passed so that the looked-up term is
        expected to be part of the result even when it is unknown.
        """
        return self.__sym_spell.lookup(input_term,
                                       Verbosity.ALL,
                                       max_edit_distance=1,
                                       include_unknown=True)

    def __sym_spell_step(self, input_file, output_file, filter_file):
        """Run one correction pass over ``input_file``.

        Args:
            input_file: ``term,frequency`` lines to correct.
            output_file: Receives the corrected dictionary through the
                parser; may be the same path as ``input_file``.
            filter_file: Receives one ``original,corrected,frequency``
                line for every query that was changed.
        """
        updated_dictionary = {}
        # Context managers close both files even if a line fails to parse.
        with open(filter_file, "w") as filtered, \
                open(input_file, "r") as queries:
            for line in queries:
                # Blank lines (e.g. a trailing newline) carry no query.
                if not line.strip():
                    continue
                (input_term, frequency) = line.split(",")

                best_suggested_query = ""
                for split_term in input_term.split(" "):
                    needs_lookup = (
                        len(split_term) > 2 and split_term.isalpha()
                        and split_term not in self.__english_dictionary)
                    if needs_lookup:
                        suggestions = self.__sym_spell_lookup(split_term)
                        suggestion = self.__get_best_suggestion_term(
                            suggestions, split_term)
                    else:
                        suggestion = split_term
                    # Separator only once something has been accumulated.
                    if len(best_suggested_query) > 0:
                        best_suggested_query += " "
                    best_suggested_query += suggestion

                updated_dictionary[best_suggested_query] = (
                    updated_dictionary.get(best_suggested_query, 0)
                    + int(frequency))
                if input_term != best_suggested_query:
                    # frequency still ends with the line's own newline.
                    filtered.write(input_term + "," + best_suggested_query
                                   + "," + frequency)

        # Written after the input is closed so input and output may be
        # the same file.
        self.__parser.write_dictionary_to_file(updated_dictionary,
                                               output_file)

    def run_sym_spell(self, iterations, input_file, output_file,
                      sym_spell_dictionary_file):
        """Load the dictionary and run ``iterations + 1`` correction passes.

        The first pass reads ``input_file``; every further pass re-reads
        and rewrites ``output_file``. Pass ``k`` (1-based) writes its
        changed queries to ``<filtered path prefix><k>.csv``.
        """
        self.__load_sym_spell_dictionary(sym_spell_dictionary_file)
        file_name = self.__sym_spell_filtered_file_path
        self.__sym_spell_step(input_file, output_file, file_name + "1.csv")
        for i in range(iterations):
            self.__sym_spell_step(output_file, output_file,
                                  file_name + str(i + 2) + ".csv")
def test_negative_max_dictionary_edit_distance(self):
    """Constructing ``SymSpell(-1, 3)`` must raise ``ValueError``.

    The exception message must state that the maximum dictionary edit
    distance cannot be negative.
    """
    with pytest.raises(ValueError) as raised:
        SymSpell(-1, 3)
    message = str(raised.value)
    self.assertEqual("max_dictionary_edit_distance cannot be negative",
                     message)
from stop_words import get_stop_words from nltk.stem.porter import PorterStemmer import re import nltk from nltk.tokenize import word_tokenize from language_detector import detect_language import pkg_resources from symspellpy import SymSpell, Verbosity sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7) dictionary_path = pkg_resources.resource_filename( "symspellpy", "frequency_dictionary_en_82_765.txt") if sym_spell.word_count: pass else: sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) ################################### #### sentence level preprocess #### ################################### # lowercase + base filter # some basic normalization def f_base(s): """ :param s: string to be processed :return: processed string: see comments in the source code for more info """ # normalization 1: xxxThis is a --> xxx. This is a (missing delimiter)
def test_lookup_compound_replaced_words(self):
    """Check ``replaced_words`` after three successive compound lookups.

    The dictionary file is located relative to this test module. After
    each lookup the number of recorded replacements must equal the
    running total of expected replacements, and every expected
    misspelling must map to its expected term. Only the last sentence
    has its returned suggestion checked as well.
    """
    print(' - %s' % inspect.stack()[0][3])
    tests_dir = os.path.realpath(os.path.dirname(__file__))
    dictionary_file = os.path.realpath(
        os.path.join(tests_dir, pardir, "symspellpy",
                     "frequency_dictionary_en_82_765.txt"))

    max_distance = 2
    checker = SymSpell(83000, max_distance, 7)
    checker.load_dictionary(dictionary_file, 0, 1)

    def assert_replaced(expected):
        # Every expected misspelling must be recorded with its fixed term.
        for misspelled, fixed in expected.items():
            self.assertEqual(fixed, checker.replaced_words[misspelled].term)

    # --- first sentence ---
    phrase = ("whereis th elove hehad dated forImuch of thepast who "
              "couqdn'tread in sixthgrade and ins pired him")
    # Kept for reference; this sentence is not asserted.
    wanted_term = ("where is the love he had dated for much of the past "
                   "who couldn't read in sixth grade and inspired him")
    replacement_1 = {
        "whereis": "where is",
        "th": "the",
        "elove": "love",
        "hehad": "he had",
        "forimuch": "for much",
        "thepast": "the past",
        "couqdn'tread": "couldn't read",
        "sixthgrade": "sixth grade",
        "ins": "in",
    }
    checker.lookup_compound(phrase, max_distance)
    running_total = len(replacement_1)
    self.assertEqual(running_total, len(checker.replaced_words))
    assert_replaced(replacement_1)

    # --- second sentence ---
    phrase = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
    # Kept for reference; this sentence is not asserted.
    wanted_term = ("in the third quarter of last year he had learned of a "
                   "secret plan")
    replacement_2 = {
        "te": "the",
        "dhird": "third",
        "qarter": "quarter",
        "oflast": "of last",
        "jear": "year",
        "hadlearned": "had learned",
        "ofca": "of a",
        "sekretplan": "secret plan",
    }
    checker.lookup_compound(phrase, max_distance)
    running_total += len(replacement_2)
    self.assertEqual(running_total, len(checker.replaced_words))
    assert_replaced(replacement_2)

    # --- third sentence ---
    phrase = ("the bigjest playrs in te strogsommer film slatew ith plety "
              "of funn")
    wanted_term = ("the biggest players in the strong summer film slate "
                   "with plenty of fun")
    replacement_3 = {
        "bigjest": "biggest",
        "playrs": "players",
        "strogsommer": "strong summer",
        "slatew": "slate",
        "ith": "with",
        "plety": "plenty",
        "funn": "fun",
    }
    suggestions = checker.lookup_compound(phrase, max_distance)
    running_total += len(replacement_3)
    self.assertEqual(running_total, len(checker.replaced_words))
    self.assertEqual(1, len(suggestions))
    self.assertEqual(wanted_term, suggestions[0].term)
    assert_replaced(replacement_3)