def main():
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    # create object
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # load dictionary
    dictionary_path = os.path.join(
        os.path.dirname(__file__),
        "./data/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"
    # expected: "the quick brown fox jumps over the lazy dog"
    result = sym_spell.word_segmentation(input_term)
    # display corrected string, summed edit distance, and summed log probability
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
def initializeSymspell():
    print("inside initializeSymspell()")
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    print("symspell created")
    resourceNames = ["symspellpy",
                     "frequency_dictionary_en_82_765.txt",
                     "frequency_bigramdictionary_en_243_342.txt"]
    dictionaryPath = pkg_resources.resource_filename(resourceNames[0],
                                                     resourceNames[1])
    bigramPath = pkg_resources.resource_filename(resourceNames[0],
                                                 resourceNames[2])
    print("dictionaryPath created")
    symspell.load_dictionary(dictionaryPath, 0, 1)
    symspell.create_dictionary_entry(key='ap', count=500000000)
    print(list(islice(symspell.words.items(), 5)))
    print("symspell.load_dictionary() done")
    # the bundled bigram dictionary stores the count in its third column
    symspell.load_bigram_dictionary(bigramPath, 0, 2)
    print(list(islice(symspell.bigrams.items(), 5)))
    print("symspell.load_bigram_dictionary() done")

    # Create vocab
    vocab = set([w for w, f in symspell.words.items()])
    return symspell, vocab
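# Hypothetical usage sketch for initializeSymspell() above: assumes symspellpy with its
# bundled dictionaries is installed; Verbosity is the standard symspellpy enum.
from symspellpy.symspellpy import Verbosity

symspell, vocab = initializeSymspell()
print("vocabulary size:", len(vocab))
for suggestion in symspell.lookup("aple", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.distance, suggestion.count)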
def spell_correction(texte):
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "../ressources/fr-100k.txt"
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(dictionary_path, term_index=0,
                                     count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0,
                                            count_index=2):
        print("Bigram dictionary file not found")
        return
    input_term = texte
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    if len(suggestions) > 0:
        return suggestions[0].term
    else:
        print("error with : ", texte)
        return texte
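# Hypothetical usage sketch for spell_correction() above: assumes the French frequency
# file ../ressources/fr-100k.txt is available; the first lookup_compound suggestion is
# returned as the corrected sentence.
corrected = spell_correction("bonjou le monde")
print(corrected)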
def __init__(self, lm, max_ed=4, prefix_length=7, l=1,
             channel_method_poisson=True, channel_prob_param=0.02):
    self.show_progress = False
    self.lm = lm
    self.l = l
    self.channel_method_poisson = channel_method_poisson
    self.channel_prob_param = channel_prob_param
    self.sym_spell = SymSpell(max_ed, prefix_length)
    if isinstance(self.lm, GPT2LMHeadModel):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.lm_sent_logscore = self.gpt2_sent_logscore
        self.beam_init = self.beam_GPT_init
        self.skipstart = 1
        self.skipend = -1
        self.update_sentence_history = self.updateGPT2history
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        for subword in range(self.tokenizer.vocab_size):
            self.sym_spell.create_dictionary_entry(
                key=self.tokenizer.decode(subword), count=1)
    else:
        self.lm_sent_logscore = self.ngram_sent_logscore
        self.beam_init = self.beam_ngram_init
        self.skipstart = self.lm.order - 1
        self.skipend = None
        self.update_sentence_history = self.updatengramhistory
        self.tokenizer = ngramTokenizer(self.lm)
        for word in lm.vocab:
            self.sym_spell.create_dictionary_entry(key=word,
                                                   count=self.lm.counts[word])
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "corpus/dictionary/dictionary.txt")
    # dictionary_path = os.path.join(os.path.dirname(__file__),
    #                                "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # lookup suggestions for single-word input strings
    input_term = "bangeeet"  # misspelling
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                   max_edit_distance_lookup)
    # display suggestion term, term frequency, and edit distance
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 3
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 3
    f = open("note.html", "r")
    noteString = f.read()
    noteString = stripHTML(noteString)
    print(noteString)
    input_term = ("whereis th elove hehad dated forImuch of thepast who "
                  "couqdn'tread in sixtgrade and ins pired him. But who aree "
                  "yooui to say its not. I am.")
    tstart = datetime.now()
    suggestions = sym_spell.lookup_compound(noteString,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    tend = datetime.now()
    time = tend - tstart
    print(time.seconds)
def __init__(self, max_dictionary_edit_distance=2, prefix_length=7,
             dictionary_path=None):
    # maximum edit distance for doing lookups
    self.max_dictionary_edit_distance = max_dictionary_edit_distance
    # length of word prefixes used for spell checking
    self.prefix_length = prefix_length
    # create object
    self.sym_spell = SymSpell(
        max_dictionary_edit_distance=self.max_dictionary_edit_distance,
        prefix_length=self.prefix_length)
    # load dictionary
    if dictionary_path is None:
        dictionary_path = os.path.join(
            os.path.dirname(__file__),
            "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                          count_index):
        print('Dictionary file not found')
def setup(initial_capacity=83000, prefix_length=7,
          max_edit_distance_dictionary=2):
    global maximum_edit_distance
    maximum_edit_distance = max_edit_distance_dictionary
    dict_path = '/home/fa6/data/symspellpy/frequency_dictionary_en_82_765.txt'
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length, count_threshold=30)
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dict_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # lookup suggestions for single-word input strings
    # input_term = "memebers"  # misspelling of "members"
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    # max_edit_distance_lookup = 2
    # suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    # suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
    #                                max_edit_distance_lookup)
    # # display suggestion term, term frequency, and edit distance
    # for suggestion in suggestions:
    #     print("{}, {}, {}".format(suggestion.term, suggestion.count,
    #                               suggestion.distance))
    return sym_spell
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Load a symspell Spell Corrector for Malay.

    Returns
    -------
    result: malaya.spell.Symspell class
    """
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except BaseException:
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.')
    path = check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'],
                      **kwargs)
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(path['model'], term_index, count_index)
    path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(path['model']) as fopen:
        corpus = json.load(fopen)
    return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)
def spelling_preprocessor():
    import os
    from symspellpy.symspellpy import SymSpell, Verbosity

    max_edit_distance_dictionary = 2
    prefix_length = 7
    sc = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(
        os.getenv('HOME'),
        'symspellpy/symspellpy/frequency_dictionary_en_82_765.txt')
    term_index = 0
    count_index = 1
    if not sc.load_dictionary(dictionary_path, term_index, count_index):
        raise ImportError('Unable to load spelling dictionary')
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST

    @string_check
    def checker(s):
        words = s.split()
        corrected_words = list()
        for word in words:
            correction = sc.lookup(word, suggestion_verbosity,
                                   max_edit_distance_lookup)
            if correction:
                corrected_words.append(correction[0].term)
            else:
                corrected_words.append(word)
        return ' '.join(corrected_words)

    return checker
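# Hypothetical usage sketch for spelling_preprocessor() above: assumes the string_check
# decorator is defined in the same module and the dictionary path under $HOME exists;
# the returned closure corrects the input word by word.
checker = spelling_preprocessor()
print(checker("thsi sentnce has a fw typos"))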
def createSymSpell(dict='ru-100k.txt', encoding='utf-8'):
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=5)
    symspell.load_dictionary(dict, encoding=encoding, term_index=0,
                             count_index=1)
    return symspell
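# Hypothetical usage sketch for createSymSpell() above: assumes the Russian frequency
# file ru-100k.txt sits next to the script; the query word is an illustrative misspelling.
from symspellpy.symspellpy import Verbosity

symspell = createSymSpell()
for suggestion in symspell.lookup("превед", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.count)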
def create_context_speller():
    """Creates a context speller, which uses the context frequency lookup table"""
    # Initialize Context Symspell Checker
    context_sym_spell = SymSpell(83000, 2, 7)
    # load dictionary
    lookup_path = os.path.join(os.path.dirname(__file__),
                               "./data/dict/context_dist_small.txt")
    if not context_sym_spell.load_dictionary(lookup_path, 0, 1):
        raise Exception("Dictionary file not found")

    # Creates the spell checker
    def check_spell(word):
        suggestions = context_sym_spell.lookup(word, Verbosity.CLOSEST, 2)
        if len(suggestions) == 0:
            # Not in context
            return True
        else:
            correct = True
            for suggestion in suggestions:
                if suggestion.distance == 1:
                    correct = False
            return correct

    return check_spell
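# Hypothetical usage sketch for create_context_speller() above: assumes
# ./data/dict/context_dist_small.txt exists; check_spell returns True when the word is
# either absent from the context table or has no edit-distance-1 neighbour.
check_spell = create_context_speller()
print(check_spell("apple"))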
def test_words_from_list_with_shared_prefix_should_retain_counts(self):
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 1, 3, words=[
        "pipe", "pipe", "pipe", "pipe", "pipe",
        "pips", "pips", "pips", "pips", "pips",
        "pips", "pips", "pips", "pips", "pips"
    ])
    result = sym_spell.lookup("pipe", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pipe", result[0].term)
    self.assertEqual(5, result[0].count)
    self.assertEqual("pips", result[1].term)
    self.assertEqual(10, result[1].count)
    result = sym_spell.lookup("pips", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pips", result[0].term)
    self.assertEqual(10, result[0].count)
    self.assertEqual("pipe", result[1].term)
    self.assertEqual(5, result[1].count)
    result = sym_spell.lookup("pip", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pips", result[0].term)
    self.assertEqual(10, result[0].count)
    self.assertEqual("pipe", result[1].term)
    self.assertEqual(5, result[1].count)
def correct_spelling(sentence):
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 5
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    if "& ;" in sentence:
        sentence = sentence.replace("& ;", "and")
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(sentence,
                                            max_edit_distance_lookup)
    save = ""
    for suggestion in suggestions:
        save = suggestion.term
        # print("{}".format(save))
        break
    # if "#" in save:
    #     save = sym_spell.word_segmentation(save)
    return save
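# Hypothetical usage sketch for correct_spelling() above: assumes
# frequency_dictionary_en_82_765.txt sits next to the script; lookup_compound corrects
# the whole sentence and the first suggestion is returned.
print(correct_spelling("whereis th elove hehad dated forImuch of thepast"))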
def test_words_with_shared_prefix_should_retain_counts(self):
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 1, 3)
    sym_spell.create_dictionary_entry("pipe", 5)
    sym_spell.create_dictionary_entry("pips", 10)
    result = sym_spell.lookup("pipe", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pipe", result[0].term)
    self.assertEqual(5, result[0].count)
    self.assertEqual("pips", result[1].term)
    self.assertEqual(10, result[1].count)
    result = sym_spell.lookup("pips", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pips", result[0].term)
    self.assertEqual(10, result[0].count)
    self.assertEqual("pipe", result[1].term)
    self.assertEqual(5, result[1].count)
    result = sym_spell.lookup("pip", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pips", result[0].term)
    self.assertEqual(10, result[0].count)
    self.assertEqual("pipe", result[1].term)
    self.assertEqual(5, result[1].count)
def getSymspellDict(direc):
    print("loading symspell object")
    sym_spell = SymSpell(83000, 2, 7)
    if not sym_spell.load_dictionary(direc, 0, 1):
        print("Dictionary file not found")
    return sym_spell
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Train a symspell Spell Corrector.

    Returns
    -------
    result: malaya.spell.SYMSPELL class
    """
    check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs)
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except:
        raise Exception(
            'symspellpy not installed. Please install it and try again.')
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return SYMSPELL(sym_spell, Verbosity.ALL, corpus, k=top_k)
def main(): # maximum edit distance per dictionary precalculation max_edit_distance_dictionary = 2 prefix_length = 7 # create object sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) # load dictionary dictionary_path = os.path.join(os.path.dirname(__file__), "frequency_dictionary_en_82_765.txt") term_index = 0 # column of the term in the dictionary text file count_index = 1 # column of the term frequency in the dictionary text file if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): print("Dictionary file not found") return sym_spell.load_dictionary( "/home/yadi/projectDISK/Python-Projects/ML-NLP/dictionary.txt", 0, 1) # lookup suggestions for multi-word input strings (supports compound # splitting & merging) input_term = ("whereis th elove hehad dated forImuch of thepast who " "couqdn'tread in sixtgrade and ins pired him." "I'm workig in th e yadolah shahrary working in githib") # max edit distance per lookup (per single word, not per whole input string) max_edit_distance_lookup = 1 suggestions = sym_spell.lookup_compound(input_term, max_edit_distance_lookup, transfer_casing=True) # display suggestion term, edit distance, and term frequency print(input_term) for suggestion in suggestions: print("{}".format(suggestion.term))
def __init__(self, train=False, save=False, corpus_path=CORPUS_PATH,
             threshold=2):
    self.slang_dict = pickle.load(
        open(os.path.join(os.path.dirname(__file__),
                          "pickled/_slang_words.p"), "rb"))
    self.slang_dict['dr'] = 'dari'
    self.slang_dict['k'] = 'ke'
    self.slang_dict['sc'] = 'sesar'
    if train:
        create_dictionary.main()
        self.words = self.__words(corpus_path)
        self.counter = self.__counter(self.words)
        self.model = model.LanguageModel(corpus_path=corpus_path)
    else:
        self.words = pickle.load(
            open(os.path.join(os.path.dirname(__file__),
                              "pickled/_spell_words.p"), "rb"))
        self.counter = pickle.load(
            open(os.path.join(os.path.dirname(__file__),
                              "pickled/_spell_counter.p"), "rb"))
        self.model = model.LanguageModel(load=True)
    try:
        for key in self.counter:
            if self.counter[key] <= threshold:
                self.words.remove(key)
    except:
        pass
    self.candidates_dict = {}
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    self.factory = StemmerFactory()
    self.stemmer = self.factory.create_stemmer()
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "corpus/dictionary/dictionary.txt")
    # dictionary_path = os.path.join(os.path.dirname(__file__),
    #                                "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                          count_index, encoding="utf-8"):
        print("Dictionary file not found")
        return
    if save:
        self.save()
def correctly_spelled(data, max_edit_distance_lookup=None):
    # Make the SymspellPy-based speller global to be able to be used in the
    # body of this function
    global sym_speller
    if sym_speller is None:  # If the speller is not initialized
        # Initialize the speller provided its parameters as previously defined
        sym_speller = SymSpell(max_edit_distance_dictionary, prefix_length)
        # Load the frequency dictionary to the speller
        sym_spell_dict_path = os.path.join(os.path.dirname(__file__),
                                           "frequency_dictionary_en_82_765.txt")
        term_index = 0  # Column of the term in the dictionary text file
        count_index = 1  # Column of the term frequency in the dictionary text file
        if not sym_speller.load_dictionary(sym_spell_dict_path, term_index,
                                           count_index):
            # If the dictionary was not found, print an error message
            print("ERROR! SymSpellPy dictionary not found at following path:",
                  sym_spell_dict_path)
            os._exit(1)  # Exit the entire program
    if max_edit_distance_lookup is None:
        # If no maximum edit distance during lookup is specified, use the same
        # edit distance as the maximum edit distance on the dictionary
        max_edit_distance_lookup = max_edit_distance_dictionary
    # Correct spelling of each token in the text and return the data sample
    return " ".join([
        (sym_speller.lookup_compound(t, max_edit_distance_lookup)[0].term
         if t.isalpha() and not (t == data[0] or t == data[1]
                                 or ("".join([x[0] for x in data[1].split()]) == t
                                     if len(data[1].split()) >= 3 else False))
         else t)
        for t in tokenized(data[2])
    ])
def spelling_correction(data, column):
    from symspellpy.symspellpy import SymSpell, Verbosity

    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = "frequency_dictionary_en_82_765.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    df_final = pd.DataFrame()
    for index, row in data.iterrows():
        # lookup suggestions for single-word input strings
        text = row[column]
        # max edit distance per lookup
        # (max_edit_distance_lookup <= max_edit_distance_dictionary)
        for input_term in text.split():
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if len(suggestions) > 0:
                df_local = pd.DataFrame({'Original Word': [input_term],
                                         'Replacement': [suggestions[0].term]})
                df_final = df_final.append(df_local)
    return df_final
def initialize(self):
    print("Initializing Text Cleaner..")
    print("Initializing Smart Contractions Module..")
    self.cont = Contractions(self.embedding_for_smart_contraction)
    self.cont.load_models()
    print("Initializing Stopwords Module..")
    self.stop_words = set(stopwords.words('english'))
    stop_words_without_negation = copy.deepcopy(self.stop_words)
    stop_words_without_negation.remove('no')
    stop_words_without_negation.remove('nor')
    stop_words_without_negation.remove('not')
    self.stop_words_without_negation = stop_words_without_negation
    self.pos_tags_set_1 = {'NNP'}
    print("Initializing Wordnet Lemmatizer Module..")
    self.wnl = WordNetLemmatizer()
    print("Initializing Spellcheck Module..")
    max_edit_distance_dictionary = 2
    prefix_length = 7
    self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.abspath('') + "\\" + self.spell_dictonarypath
    self.sym_spell.load_dictionary(dictionary_path, 0, 1)
    print("Initialization complete!")
def _create_symspell_checker(self, language: AnyStr) -> SymSpell:
    """Private method to create a SymSpell instance for a given language

    Args:
        language: Language code in ISO 639-1 format

    Returns:
        SymSpell checker instance loaded with the language dictionary

    """
    start = perf_counter()
    logging.info(f"Loading spellchecker for language '{language}'...")
    symspell_checker = SymSpell(
        max_dictionary_edit_distance=self.edit_distance)
    frequency_dict_path = self.dictionary_folder_path + "/" + language + ".txt"
    symspell_checker.load_dictionary(frequency_dict_path, term_index=0,
                                     count_index=1, encoding="utf-8")
    if len(self.custom_vocabulary_set) != 0:
        for word in self.custom_vocabulary_set:
            symspell_checker.create_dictionary_entry(key=word, count=1)
    logging.info(
        f"Loading spellchecker for language '{language}': done in "
        f"{perf_counter() - start:.2f} seconds")
    return symspell_checker
def test_lookup_should_replicate_noisy_results(self):
    print(' - %s' % inspect.stack()[0][3])
    cwd = path.realpath(path.dirname(__file__))
    dictionary_path = path.realpath(
        path.join(cwd, pardir, "symspellpy",
                  "frequency_dictionary_en_82_765.txt"))
    query_path = path.join(cwd, "fortests", "noisy_query_en_1000.txt")
    edit_distance_max = 2
    prefix_length = 7
    verbosity = Verbosity.CLOSEST
    sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
    sym_spell.load_dictionary(dictionary_path, 0, 1)

    test_list = []
    with open(query_path, "r") as infile:
        for line in infile.readlines():
            line_parts = line.rstrip().split(" ")
            if len(line_parts) >= 2:
                test_list.append(line_parts[0])
    result_sum = 0
    for phrase in test_list:
        result_sum += len(
            sym_spell.lookup(phrase, verbosity, edit_distance_max))
    self.assertEqual(4945, result_sum)
def load_spell_checker():
    """Return spell checker"""
    if not os.path.exists("data/unigrams.txt"):
        sents = [normalize_text(" ".join(x)).split() for x in floresta.sents()]
        sents += [normalize_text(" ".join(x)).split() for x in machado.sents()]
        sents += [normalize_text(" ".join(x)).split()
                  for x in mac_morpho.sents()]
        unigrams = [item for sublist in sents for item in sublist]
        unigrams = nltk.probability.FreqDist(unigrams)
        file = open("data/unigrams.txt", "w")
        for k, v in unigrams.items():
            file.write(f"{k} {v}\n")
        file.close()
        bigrams = []
        for sent in sents:
            bigrams += list(nltk.bigrams(sent))
        bigrams = nltk.probability.FreqDist(bigrams)
        file = open("data/bigrams.txt", "w")
        for k, v in bigrams.items():
            file.write(f"{' '.join(k)} {v}\n")
        file.close()
    result = SymSpell()
    result.load_dictionary("data/unigrams.txt", 0, 1)
    result.load_bigram_dictionary("data/bigrams.txt", 0, 2)
    return result
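# Hypothetical usage sketch for load_spell_checker() above: assumes the NLTK Portuguese
# corpora (floresta, machado, mac_morpho) are downloaded and a data/ directory exists;
# the unigram and bigram files are built on the first call.
checker = load_spell_checker()
suggestions = checker.lookup_compound("qem sabe", max_edit_distance=2)
if suggestions:
    print(suggestions[0].term)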
def test_lookup_should_not_return_non_word_delete(self):
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 2, 7, 10)
    sym_spell.create_dictionary_entry("pawn", 10)
    result = sym_spell.lookup("paw", Verbosity.TOP, 0)
    self.assertEqual(0, len(result))
    result = sym_spell.lookup("awn", Verbosity.TOP, 0)
    self.assertEqual(0, len(result))
def load_symspell(dict_path='symspell/frequency_dictionary_en_82_765.txt',
                  max_edit_distance_dictionary=2, prefix_length=7,
                  term_index=0, count_index=1):
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(dict_path, term_index, count_index)
    return sym_spell
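# Hypothetical usage sketch for load_symspell() above: assumes the frequency dictionary
# exists at the default dict_path.
from symspellpy.symspellpy import Verbosity

sym_spell = load_symspell()
for suggestion in sym_spell.lookup("memebers", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.distance, suggestion.count)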
def __init__(self, progress, directory, countries_dict):
    self.progress = progress
    self.logger = logging.getLogger(__name__)
    self.spelling_update = Counter()
    self.directory = directory
    self.spell_path = os.path.join(self.directory, 'spelling.pkl')
    self.countries_dict = countries_dict
    self.sym_spell = SymSpell()
def symspell_dict(max_edit_dist, prefix_len):
    dictfile = DICT_DIR / "big.txt"  # downloaded from Peter Norvig's site
    sym_spell = SymSpell(max_edit_dist, prefix_len)
    # create the symspell dictionary using the dictfile
    if not sym_spell.create_dictionary(str(dictfile)):
        print("corpus file not found")
    return sym_spell
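# Hypothetical usage sketch for symspell_dict() above: assumes DICT_DIR points at a
# folder containing Peter Norvig's big.txt corpus.
from symspellpy.symspellpy import Verbosity

sym_spell = symspell_dict(max_edit_dist=2, prefix_len=7)
for suggestion in sym_spell.lookup("speling", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.count)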
def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
        self):
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 2, 7, 10)
    sym_spell.create_dictionary_entry("flame", 20)
    sym_spell.create_dictionary_entry("flam", 1)
    result = sym_spell.lookup("flam", Verbosity.TOP, 0)
    self.assertEqual(0, len(result))