def main():
    """Demo: load the English frequency dictionary and word-segment a
    sentence written without spaces, printing the segmentation result.

    FIX: the original created a second SymSpell instance, re-loaded the
    dictionary, and segmented the input twice; the dead duplicates and the
    unused `typo`/`correction` variables are removed. Output is unchanged.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    initial_capacity = 83000
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # load dictionary
    dictionary_path = os.path.join(
        os.path.dirname(__file__),
        "./data/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"
    result = sym_spell.word_segmentation(input_term)
    # display corrected string, total edit distance, and log probability
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
def initializeSymspell():
    """Create a SymSpell checker loaded with the bundled English unigram and
    bigram dictionaries, plus a manual high-frequency entry for 'ap'.

    Returns:
        tuple: (SymSpell instance, set of all known vocabulary words)

    FIX: the bigram dictionary was loaded with count_index=1, which points at
    the *second word* of each bigram; symspellpy then fails to parse it as an
    int and skips every line, leaving the bigram table empty. The bundled
    bigram file has columns "word1 word2 count", so count_index must be 2.
    """
    print("inside initializeSymspell()")
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    print("symspell created")
    resourceNames = [
        "symspellpy",
        "frequency_dictionary_en_82_765.txt",
        "frequency_bigramdictionary_en_243_342.txt",
    ]
    dictionaryPath = pkg_resources.resource_filename(resourceNames[0],
                                                     resourceNames[1])
    bigramPath = pkg_resources.resource_filename(resourceNames[0],
                                                 resourceNames[2])
    print("dictionaryPath created")
    symspell.load_dictionary(dictionaryPath, 0, 1)
    symspell.create_dictionary_entry(key='ap', count=500000000)
    print(list(islice(symspell.words.items(), 5)))
    print("symspell.load_ditionary() done")
    # bigram file columns: term1 term2 count -> term_index=0, count_index=2
    symspell.load_bigram_dictionary(bigramPath, 0, 2)
    print(list(islice(symspell.bigrams.items(), 5)))
    print("symspell.load_bigram_ditionary() done")
    # Create vocab: the words dict's keys are exactly the known terms
    vocab = set(symspell.words)
    return symspell, vocab
class SymSpellCorrection:
    """ Use SymSpell for correction """

    def __init__(self, dictionary_path, term_index=0, count_index=1,
                 max_edit_distance_dictionary=0, prefix_length=7, **args):
        """
        Input:
            - dictionary_path: string
            - term_index: int, column of the term in the dictionary text file,
              default is 0
            - count_index: int, column of the term frequency in the dictionary
              text file, default is 1
            - max_edit_distance_dictionary: int, maximum edit distance per
              dictionary precalculation, default is 0
            - prefix_length, int, default is 7
        """
        from symspellpy.symspellpy import SymSpell
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.sym_spell.load_dictionary(dictionary_path, term_index,
                                       count_index)

    def __call__(self, sentence):
        """
        Input:
            - sentence: string
        Output:
            - string
        """
        if len(sentence) < 1:
            return sentence
        try:
            corrected = self.sym_spell.word_segmentation(
                sentence).corrected_string
        # FIX: was a bare `except:`, which also swallows KeyboardInterrupt
        # and SystemExit; best-effort fallback behavior is preserved.
        except Exception:
            print("Error spell correction:", sentence)
            corrected = sentence
        return corrected
def _create_symspell_checker(self, language: AnyStr) -> SymSpell:
    """Private method to create a SymSpell instance for a given language

    Args:
        language: Language code in ISO 639-1 format

    Returns:
        SymSpell checker instance loaded with the language dictionary
    """
    start = perf_counter()
    logging.info(f"Loading spellchecker for language '{language}'...")
    symspell_checker = SymSpell(
        max_dictionary_edit_distance=self.edit_distance)
    frequency_dict_path = self.dictionary_folder_path + "/" + language + ".txt"
    symspell_checker.load_dictionary(frequency_dict_path, term_index=0,
                                     count_index=1, encoding="utf-8")
    # Custom words are added with count=1: recognized as valid but never
    # preferred over genuine dictionary entries. The previous
    # `if len(...) != 0` guard was redundant — looping an empty set is a no-op.
    for word in self.custom_vocabulary_set:
        symspell_checker.create_dictionary_entry(key=word, count=1)
    logging.info(
        f"Loading spellchecker for language '{language}': done in {perf_counter() - start:.2f} seconds"
    )
    return symspell_checker
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Train a symspell Spell Corrector.

    Returns
    -------
    result: malaya.spell.SYMSPELL class
    """
    check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs)
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    # FIX: was a bare `except:`, which would also mask KeyboardInterrupt /
    # SystemExit; only a failed import should trigger this message.
    except ImportError:
        raise Exception(
            'symspellpy not installed. Please install it and try again.')
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return SYMSPELL(sym_spell, Verbosity.ALL, corpus, k=top_k)
def main():
    """Correct a noisy multi-word string with SymSpell's compound lookup
    (handles word splitting and merging) and print each suggestion term."""
    # dictionary is precalculated for edit distance 2, prefix length 7
    sym_spell = SymSpell(2, 7)
    # primary dictionary: column 0 = term, column 1 = frequency
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return
    # merge in an additional custom dictionary (absolute path)
    sym_spell.load_dictionary(
        "/home/yadi/projectDISK/Python-Projects/ML-NLP/dictionary.txt", 0, 1)
    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    input_term = ("whereis th elove hehad dated forImuch of thepast who "
                  "couqdn'tread in sixtgrade and ins pired him."
                  "I'm workig in th e yadolah shahrary working in githib")
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 1
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup,
                                            transfer_casing=True)
    print(input_term)
    for hit in suggestions:
        print("{}".format(hit.term))
def load_spell_checker():
    """Return a SymSpell checker backed by unigram/bigram frequency files.

    On first run the frequency files are built from the NLTK floresta,
    machado and mac_morpho corpora and cached under data/.

    FIX: output files were opened without `with`, leaking the handles if a
    write raised; the local name `file` also shadowed the builtin.
    """
    if not os.path.exists("data/unigrams.txt"):
        sents = [normalize_text(" ".join(x)).split() for x in floresta.sents()]
        sents += [normalize_text(" ".join(x)).split() for x in machado.sents()]
        sents += [
            normalize_text(" ".join(x)).split() for x in mac_morpho.sents()
        ]
        unigrams = [item for sublist in sents for item in sublist]
        unigrams = nltk.probability.FreqDist(unigrams)
        # `with` guarantees the file is closed even if a write fails
        with open("data/unigrams.txt", "w") as out:
            for k, v in unigrams.items():
                out.write(f"{k} {v}\n")
        bigrams = []
        for sent in sents:
            bigrams += list(nltk.bigrams(sent))
        bigrams = nltk.probability.FreqDist(bigrams)
        with open("data/bigrams.txt", "w") as out:
            for k, v in bigrams.items():
                out.write(f"{' '.join(k)} {v}\n")
    result = SymSpell()
    result.load_dictionary("data/unigrams.txt", 0, 1)
    result.load_bigram_dictionary("data/bigrams.txt", 0, 2)
    return result
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Load a symspell Spell Corrector for Malay.

    Returns
    -------
    result: malaya.spell.Symspell class
    """
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    # FIX: `except BaseException` also caught KeyboardInterrupt/SystemExit;
    # only an import failure should be reported as a missing module.
    except ImportError:
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.')
    path = check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'],
                      **kwargs)
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(path['model'], term_index, count_index)
    path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(path['model']) as fopen:
        corpus = json.load(fopen)
    return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)
def test_lookup_should_replicate_noisy_results(self):
    """Sum suggestion counts for every noisy query and compare against the
    known-good total (4945) to guard lookup behavior against regressions."""
    print(' - %s' % inspect.stack()[0][3])
    cwd = path.realpath(path.dirname(__file__))
    dictionary_path = path.realpath(
        path.join(cwd, pardir, "symspellpy",
                  "frequency_dictionary_en_82_765.txt"))
    query_path = path.join(cwd, "fortests", "noisy_query_en_1000.txt")
    edit_distance_max = 2
    prefix_length = 7
    verbosity = Verbosity.CLOSEST
    sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    test_list = []
    with open(query_path, "r") as infile:
        # FIX: iterate the file directly instead of materializing the whole
        # file with readlines()
        for line in infile:
            line_parts = line.rstrip().split(" ")
            if len(line_parts) >= 2:
                test_list.append(line_parts[0])
    result_sum = 0
    for phrase in test_list:
        result_sum += len(
            sym_spell.lookup(phrase, verbosity, edit_distance_max))
    self.assertEqual(4945, result_sum)
def createSymSpell(dict='ru-100k.txt', encoding='utf-8'):
    """Build a SymSpell checker from the given frequency-dictionary file.

    NOTE(review): the parameter name ``dict`` shadows the builtin; it is kept
    unchanged for backward compatibility with keyword callers.
    """
    # edit distance 2 with a short prefix (5) keeps the index compact
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=5)
    checker.load_dictionary(dict, term_index=0, count_index=1,
                            encoding=encoding)
    return checker
def load_symspell(dict_path='symspell/frequency_dictionary_en_82_765.txt',
                  max_edit_distance_dictionary=2, prefix_length=7,
                  term_index=0, count_index=1):
    """Instantiate a SymSpell checker and load the given frequency file."""
    checker = SymSpell(max_edit_distance_dictionary, prefix_length)
    checker.load_dictionary(dict_path, term_index, count_index)
    return checker
def symspell_checker(text):
    """Correct *text* with SymSpell's compound lookup; return the input
    unchanged when no suggestion is produced."""
    from symspellpy.symspellpy import SymSpell
    spell = SymSpell()
    # unigram dictionary: term in column 0, frequency in column 1
    spell.load_dictionary(r"frequency_dictionary_en_82_765.txt", 0, 1)
    # bigram dictionary: two terms then the count in column 2
    spell.load_bigram_dictionary(r"frequency_bigramdictionary_en_243_342.txt",
                                 0, 2)
    suggestions = spell.lookup_compound(text, 2)
    # the first suggestion is the best-ranked one
    if suggestions:
        return suggestions[0].term
    return text
def symspell(
        validate=True,
        max_edit_distance_dictionary=2,
        prefix_length=7,
        term_index=0,
        count_index=1,
        top_k=10,
):
    """
    Train a symspell Spell Corrector.

    Parameters
    ----------
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not
        available.

    Returns
    -------
    _SpellCorrector: malaya.spell._SymspellCorrector class
    """
    if not isinstance(validate, bool):
        raise ValueError('validate must be a boolean')
    if not isinstance(max_edit_distance_dictionary, int):
        raise ValueError('max_edit_distance_dictionary must be an integer')
    if not isinstance(prefix_length, int):
        raise ValueError('prefix_length must be an integer')
    if not isinstance(term_index, int):
        raise ValueError('term_index must be an integer')
    if not isinstance(count_index, int):
        raise ValueError('count_index must be an integer')
    if validate:
        check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'])
        check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1])
    else:
        if not check_available(PATH_NGRAM['symspell']):
            raise Exception(
                'preprocessing is not available, please `validate = True`')
        if not check_available(PATH_NGRAM[1]):
            raise Exception(
                'preprocessing is not available, please `validate = True`')
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    # FIX: was a bare `except:`, which also masks KeyboardInterrupt /
    # SystemExit; only a failed import should raise this message.
    except ImportError:
        raise Exception(
            'symspellpy not installed. Please install it and try again.')
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return _SymspellCorrector(sym_spell, Verbosity.ALL, corpus, k=top_k)
def __new__(cls):
    """Lazily build and cache a singleton SymSpell checker loaded with the
    bundled English unigram and bigram dictionaries."""
    if cls._instance is None:
        # Symspell configuration
        max_edit_distance_dictionary = 3
        prefix_length = 4
        spellchecker = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        spellchecker.load_dictionary(dictionary_path, term_index=0,
                                     count_index=1)
        # BUG FIX: the bigram dictionary was previously loaded from
        # `dictionary_path` (the unigram file), so no bigram data was ever
        # loaded; it must come from `bigram_path`.
        spellchecker.load_bigram_dictionary(bigram_path, term_index=0,
                                            count_index=2)
        cls._instance = spellchecker
    return cls._instance
def test_lookup_compound_ignore_non_words(self):
    """lookup_compound with ignore_non_words=True must keep numbers and
    unknown all-caps tokens verbatim while correcting the rest.

    FIX: the same load/lookup/assert pattern was copy-pasted five times;
    it is now data-driven, so a new case is a one-line addition.
    """
    print(' - %s' % inspect.stack()[0][3])
    cwd = path.realpath(path.dirname(__file__))
    dictionary_path = path.realpath(
        path.join(cwd, pardir, "symspellpy",
                  "frequency_dictionary_en_82_765.txt"))
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    # (typo, expected correction) pairs
    cases = [
        (("whereis th elove 123 hehad dated forImuch of THEPAST who "
          "couqdn'tread in SIXTHgrade and ins pired him"),
         ("where is the love 123 he had dated for much of THEPAST "
          "who couldn't read in sixth grade and inspired him")),
        ("in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan",
         ("in the DHIRD 1 quarter of last year he had learned "
          "of a secret plan")),
        (("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
          "of 12 funn"),
         ("the biggest players in the strong summer film slate "
          "with PLETY of 12 fun")),
        (("Can yu readtHIS messa ge despite thehorible 1234 "
          "sppelingmsitakes"),
         ("can you read this message despite the horrible 1234 "
          "spelling mistakes")),
        ("PI on leave, arrange Co-I to do screening",
         "PI on leave arrange co i to do screening"),
    ]
    for typo, correction in cases:
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
def correct_spelling(sentence):
    """Spell-correct *sentence* via SymSpell compound lookup.

    Returns the best suggestion term, '' when no suggestion is produced, or
    None when the dictionary file cannot be loaded.

    FIX: removed commented-out dead code; replaced the for/break that picked
    the first suggestion with a direct conditional expression.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 5
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # normalize a mangled HTML entity before lookup
    if "& ;" in sentence:
        sentence = sentence.replace("& ;", "and")
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(sentence,
                                            max_edit_distance_lookup)
    # the first suggestion is the best-ranked one
    save = suggestions[0].term if suggestions else ""
    return save
def spelling_preprocessor():
    """Build and return a callable that spell-corrects a sentence word by
    word using SymSpell's CLOSEST single-word lookup."""
    import os
    from symspellpy.symspellpy import SymSpell, Verbosity
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sc = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(
        os.getenv('HOME'),
        'symspellpy/symspellpy/frequency_dictionary_en_82_765.txt')
    # column 0 holds the term, column 1 its frequency
    if not sc.load_dictionary(dictionary_path, 0, 1):
        raise ImportError('Unable to load spelling dictionary')
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST

    @string_check
    def checker(s):
        # keep a word unchanged unless the lookup offers a suggestion
        corrected = []
        for token in s.split():
            hits = sc.lookup(token, suggestion_verbosity,
                             max_edit_distance_lookup)
            corrected.append(hits[0].term if hits else token)
        return ' '.join(corrected)
    return checker
def main():
    """Look up spelling suggestions for one misspelled word and print each
    suggestion's term, edit distance, and frequency."""
    # dictionary precalculated for edit distance 2, prefix length 7
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary: column 0 = term, column 1 = frequency
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return
    input_term = "bangeeet"  # misspelling
    # lookup distance must not exceed the dictionary's precalculated distance
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST,
                                   max_edit_distance_lookup)
    for hit in suggestions:
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))
def spelling_correction(data, column):
    """For every word in data[column], look up the closest SymSpell
    correction and collect (original, replacement) pairs.

    Input:
        - data: pandas DataFrame
        - column: name of the text column to correct
    Output:
        - DataFrame with columns 'Original Word' and 'Replacement'

    FIX: `DataFrame.append` was removed in pandas 2.0 and was O(n^2) anyway
    (one new frame per word); rows are now collected in a list and the
    result frame is built once.
    """
    from symspellpy.symspellpy import SymSpell, Verbosity
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "frequency_dictionary_en_82_765.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    rows = []
    for index, row in data.iterrows():
        text = row[column]
        # max edit distance per lookup
        # (max_edit_distance_lookup <= max_edit_distance_dictionary)
        for input_term in text.split():
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if suggestions:
                rows.append({'Original Word': input_term,
                             'Replacement': suggestions[0].term})
    return pd.DataFrame(rows)
def setup(initial_capacity=83000, prefix_length=7,
          max_edit_distance_dictionary=2):
    """Create and return a SymSpell checker loaded from the local frequency
    dictionary; also records the edit distance in the module-level
    `maximum_edit_distance`. Returns None when the dictionary is missing.

    FIX: removed a large block of commented-out demo code.
    """
    global maximum_edit_distance
    maximum_edit_distance = max_edit_distance_dictionary
    dict_path = '/home/fa6/data/symspellpy/frequency_dictionary_en_82_765.txt'
    # count_threshold=30 drops very rare terms from the index
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length, count_threshold=30)
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dict_path, term_index, count_index):
        print("Dictionary file not found")
        return
    return sym_spell
def main():
    """Read note.html, strip its markup, spell-correct the text with
    lookup_compound, and print suggestions plus the elapsed seconds.

    FIX: the input file was opened without ever being closed (handle leak);
    removed the dead `input_term` literal that was never used.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 3
    with open("note.html", "r") as f:
        noteString = f.read()
    noteString = stripHTML(noteString)
    print(noteString)
    tstart = datetime.now()
    suggestions = sym_spell.lookup_compound(noteString,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    elapsed = datetime.now() - tstart
    print(elapsed.seconds)
def create_context_speller():
    """Creates a context speller, which uses the context frequency lookup table

    FIX: the inner loop scanned every suggestion even after finding one at
    edit distance 1; `not any(...)` short-circuits and also covers the
    empty-suggestions case (accept the word) in one expression.
    """
    # Initialize Context Symspell Checker
    context_sym_spell = SymSpell(83000, 2, 7)
    # load dictionary
    lookup_path = os.path.join(os.path.dirname(
        __file__), "./data/dict/context_dist_small.txt")
    if not context_sym_spell.load_dictionary(lookup_path, 0, 1):
        raise Exception("Dictionary file not found")

    def check_spell(word):
        """Return True when *word* is acceptable: unknown to the context
        table, or with no suggestion at edit distance exactly 1."""
        suggestions = context_sym_spell.lookup(word, Verbosity.CLOSEST, 2)
        return not any(s.distance == 1 for s in suggestions)
    return check_spell
def correctly_spelled(data, max_edit_distance_lookup=None): global sym_speller # Make the SymspellPy-based speller global to be able to be used in the body of this function if sym_speller is None: # If the speller is not initialized sym_speller = SymSpell( max_edit_distance_dictionary, prefix_length) # Initialize the speller provided its parameters as # previously defined sym_spell_dict_path = os.path.join(os.path.dirname(__file__), "frequency_dictionary_en_82_765.txt" ) # Load the frequency dictionary # to the speller term_index = 0 # Column of the term in the dictionary text file count_index = 1 # Column of the term frequency in the dictionary text file if not sym_speller.load_dictionary( sym_spell_dict_path, term_index, count_index): # If the dictionary was not found print("ERROR! SymSpellPy dictionary not found at following path:", sym_spell_dict_path ) # Print error message informing about this os._exit(1) # Exit the entire program if max_edit_distance_lookup is None: # If no maximum edit distance during lookup is specified max_edit_distance_lookup = max_edit_distance_dictionary # Assign the same edit distance to that as to the maximum edit distance # on the dictionary # Correct spelling of each token in the text and return the data sample return " ".join([ (sym_speller.lookup_compound(t, max_edit_distance_lookup)[0].term if t.isalpha() and not (t == data[0] or t == data[1] or ("".join([x[0] for x in data[1].split()]) == t if len(data[1].split()) >= 3 else False)) else t) for t in tokenized(data[2]) ])
def getSymspellDict(direc):
    """Load and return a SymSpell object from the frequency dictionary file
    at *direc* (initial capacity 83000, max edit distance 2, prefix 7).

    NOTE(review): when the file is missing this still returns the (empty)
    SymSpell instance after printing a warning — confirm callers expect that.
    """
    print("loading symspell object")
    sym_spell = SymSpell(83000, 2, 7)
    if not sym_spell.load_dictionary(direc, 0, 1):
        print("Dictionary file not found")
    return sym_spell
def spell_correction(texte):
    """Correct French text with SymSpell compound lookup.

    Uses a French unigram dictionary plus the bundled English bigram
    dictionary. Prints every suggestion, then returns the best one, or the
    input unchanged (with an error message) when no suggestion exists.
    Returns None when a dictionary file is missing.

    FIX: replaced the non-idiomatic `if (len(suggestions) > 0)` with a
    direct truthiness check.
    """
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "../ressources/fr-100k.txt"
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(
            bigram_path, term_index=0, count_index=2):
        print("Bigram dictionary file not found")
        return
    input_term = texte
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    if suggestions:
        return suggestions[0].term
    else:
        print("error with : ", texte)
        return texte
class SpellCorrector():
    """Word-level spell corrector backed by symspellpy, with elongation
    reduction ("coool" -> "cool") applied before lookup."""

    def __init__(self, max_edit_distance_dictionary=2, prefix_length=7):
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        # load dictionary
        dictionary_path = os.path.join(os.path.dirname('../'),
                                       "frequency_dictionary_en_82_765.txt")
        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                              count_index):
            # BUG FIX: `raise("Dictionary file not found")` raised a string,
            # which is itself a TypeError in Python 3; raise a real exception.
            raise FileNotFoundError("Dictionary file not found")

    def reduce_lengthening(self, text):
        # collapse runs of 3+ identical characters down to 2
        # (author-marked "not work" — behavior kept as-is)
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", text)

    def strip_punc(self, word):
        # strip a single trailing -, _, . or !
        # (author-marked "not work" — behavior kept as-is)
        return re.sub(r"[\-\_\.\!]$", "", word)

    def __call__(self, word):
        """Return the closest dictionary suggestion for *word*, or the word
        itself when it is short, contains an apostrophe, or has no match."""
        word = self.reduce_lengthening(word)
        if len(word) > 2 and "'" not in word:
            suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, 2)
            if suggestions:
                return suggestions[0].term
        return word
def load(cls, language: str) -> "SpellCorrectGenerator":
    """Build a SpellCorrectGenerator for *language*.

    Args:
        language: ISO 639-1 code; only "en" is currently supported.

    Raises:
        RuntimeError: for any unsupported language code.
    """
    # maximum edit distance per dictionary pre-calculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    if language == "en":
        dict_path = (
            pathlib.Path(__file__).parent
            / "resources"
            / "frequency_dictionary_en_82_765.txt"
        )
        sym_spell.load_dictionary(str(dict_path), term_index=0, count_index=1)
        spacy_model = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    else:
        # FIX: message previously read "is currently not language"
        raise RuntimeError(f"The language {language} is currently not supported.")
    return cls(sym_spell, spacy_model)
def main():
    """Load a custom dictionary and print compound-lookup suggestions for a
    sample word.

    FIX: removed a large block of commented-out experiment code and the dead
    locals (`s`, `suggestion_verbosity`) it left behind.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 9
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "dictionary_final.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    input_term = 'live'
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def symspell_test(tokenpos_list, max_edit_distance_lookup=3,
                  initial_capacity=83000, max_edit_distance_dictionary=3,
                  prefix_length=7, term_index=0, count_index=1):
    """
    This is a function that tests the SymSpell library for spell-checking
    performance.

    Key-word arguments are:
    ** max_edit_distance_lookup : (Recommended maximum = 3)
    ** term_index : term column in dictionary (0)
    ** count_index : frequency column in dictionary (1)

    FIX: an empty lookup result previously raised an uncaught IndexError on
    `suggestions[0]`; the original word is now kept instead. Also dropped
    the redundant `(list(suggestions))[0]` copy.
    """
    print('\n{} \nBegin \'Symspellpy\' testing \n'.format('#' * 20))
    try:
        sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                             prefix_length)
        suggestion_verbosity = Verbosity.CLOSEST
        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "frequency_dictionary_en_82_765.txt")
        if not sym_spell.load_dictionary(dictionary_path, term_index,
                                         count_index):
            print("Dictionary file not found")
            return 'Error loading dictionary file'
        suggestion_list = []
        proper_noun = []
        for (word, pos) in tokenpos_list:
            # keep proper nouns and very short tokens untouched
            if pos == 'PROPN' or len(word) < 3:
                suggestion_list.append(word)
                proper_noun.append(word)
                continue
            suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if not suggestions:
                # no candidate within the edit distance: keep the word
                suggestion_list.append(word)
                continue
            suggestion = suggestions[0]
            # display suggestion term, term frequency, and edit distance
            print("input_term = {}, suggestion_term = {}, "
                  "suggestion_count = {}, suggestion_distance = {}".format(
                      word, suggestion.term, suggestion.count,
                      suggestion.distance))
            suggestion_list.append(suggestion.term)
        print("\n\nThe corrected sentence is : {}".format(
            ' '.join(suggestion_list)))
        print(suggestion_list)
        print(proper_noun)
        return suggestion_list, proper_noun
    except TypeError as error:
        print(f'Invalid type : {error}')
        return 405
def init():
    '''Init symspellpy, loading the frequency words models (dictionary and bigram dictionary)'''
    global sym_spell
    base_dir = os.path.dirname(os.path.abspath(__file__))
    # edit distance 2, prefix length 7
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # unigram model: term in column 0, count in column 1
    sym_spell.load_dictionary(
        base_dir + "/frequency_words_models/fw_pt.txt",
        term_index=0, count_index=1)
    # bigram model: two terms then the count in column 2
    sym_spell.load_bigram_dictionary(
        base_dir + "/frequency_words_models/fw_bi_pt.txt",
        term_index=0, count_index=2)
def main(argv):
    """Append an exercise list for a category to a generated markdown file.

    argv: [program, "<categoria>", <markdown file path>]

    FIX: both file handles were left open if any intermediate step raised
    (explicit close() only at the end); `with` now guarantees closure. The
    local `input` no longer shadows the builtin.
    """
    if len(argv) == 3:
        category = argv[1]
        markdown = argv[2]
    else:
        print('usage:\n python .py "<categoria>" <markdown gerado>')
        return
    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    dictionary_path = "category_count.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # map category term -> third column of the file
    # NOTE(review): assumes every line has at least 3 space-separated fields
    d = defaultdict(lambda: 0)
    with open(dictionary_path, 'r') as categorys:
        for x in categorys:
            z = x.split(' ')
            d[z[0]] = z[2]
    with open(markdown, 'a') as f:
        f.write('\n## Lista de Exercicios - %s\n' % (category).capitalize())
        category = category.lower()
        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        inputs = category.split(' ')
        total_avg = sum(map(len, inputs)) / len(inputs)
        # longer words tolerate a larger edit distance
        max_edit_distance_lookup = 3 if total_avg > 4 else 2
        for input_term in inputs:
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            for suggestion in suggestions:
                f.write("* {}, https://a2oj.com/{}".format(
                    (suggestion.term).capitalize(), d[suggestion.term]))