def __init__(self, username, client_id, client_secret, redirect_url, user):
    """Store Spotify credentials, reset playback/queue state, and authenticate."""
    super().__init__()

    # Caller-supplied account / OAuth credentials.
    self.username = username
    self.client_id = client_id
    self.client_secret = client_secret
    self.redirect_url = redirect_url
    self.user = user
    self.scope = (
        'user-library-read user-read-playback-state streaming'
        ' playlist-modify-public user-modify-playback-state playlist-modify-private '
        'user-read-playback-position user-read-currently-playing user-read-private'
    )

    # Spell checker — presumably used to clean up typed search queries; confirm against callers.
    self.spellchecker = spellchecker.SpellChecker(language=u'en', distance=2)

    # Playback / queue state, empty until a queue is generated.
    self.queue_id = None
    self.queue_uri = None
    self.current_queue = None
    self.queue_changed = False
    self.playing = False
    self.playlist_ids = None
    self.playlist_names = None
    self.search_results = None
    self.dur_time = 0
    self.artist = ""
    self.title = ""
    self.picture = qtc.QUrl()
    self.search_list = ListModel(SpotipyModule.SongWrapper)

    # Authenticate last: the device list is read off the freshly generated token.
    self.token = self.generate_token()
    self.queue = self.generate_queue()
    self.devices = self.token.devices()
def __init__(self):
    """Set up text-processing resources: stopwords, word-segmentation costs,
    a spell checker, and a month-name -> month-number lookup."""
    self.stop_words = frozenset(stopwords.words('english'))
    # helper functions for the hashtag calculation:
    # word list ranked by frequency, turned into Zipf-style costs for segmentation.
    # Fix: read the frequency file via a context manager so the handle is closed.
    with open("word_freq.txt") as freq_file:
        self.words = freq_file.read().split()
    self.wordcost = dict((k, log((i + 1) * log(len(self.words)))) for i, k in enumerate(self.words))
    self.maxword = max(len(x) for x in self.words)
    self.spell = spellchecker.SpellChecker()
    # Months dictionary to support our new rule, saving all dates in the same format
    self.month = {
        "jan": "01", "january": "01",
        "feb": "02", "february": "02",
        "mar": "03", "march": "03",
        "apr": "04", "april": "04",
        "may": "05",
        "jun": "06", "june": "06",
        "jul": "07", "july": "07",
        "aug": "08", "august": "08",
        "sep": "09", "september": "09",
        "october": "10", "oct": "10",
        "nov": "11", "november": "11",
        "dec": "12", "december": "12"
    }
def __has_correct_spelling(word: str) -> "tuple[bool, str]":
    """Check whether *word* is spelled correctly.

    Fix: the original annotated the return type as ``str`` although the
    function always returns a 2-tuple.

    :param word: the word to check
    :returns: ``(True, word)`` when the word is already correct,
        otherwise ``(False, corrected_word)`` with the most likely fix.
    """
    spell_checker = spellchecker.SpellChecker()
    corrected_word = spell_checker.correction(word)
    if corrected_word != word:
        return False, corrected_word
    return True, word
def load_spellchecker(self):
    """Initialize the spell checker.

    Loads the built-in extended dictionary shipped as package data, then,
    if the project config names one, an optional project-level word list
    of terms that should not be flagged as misspelled.
    """
    self.spell = spellchecker.SpellChecker(distance=1)

    # Built-in extended dictionary, bundled with the package.
    try:
        with resource_stream(__name__, EXTENDED_DICTIONARY) as builtin_words:
            self.load_words_from_file(builtin_words)
    except FileNotFoundError as err:
        recoverable_error("Couldn't load Dactyl built-in spelling file. "
                          "Using the default dictionary only.",
                          self.config.bypass_errors, error=err)

    # Optional per-project word list from the config.
    spelling_file = self.config.get("spelling_file", None)
    if not spelling_file:
        logger.debug("No spelling_file provided in config - skipping")
        return
    try:
        with open(spelling_file, "r", encoding="utf-8") as custom_words:
            self.load_words_from_file(custom_words)
    except FileNotFoundError as err:
        recoverable_error("Failed to load spelling_file %s: %s" %
                          (spelling_file, err),
                          self.config.bypass_errors, error=err)
def __init__(self, language: str) -> None:
    """Initialize self.

    :param language: language to check the spelling with
    :raises DictNotFound: if no dictionary exists for *language*
    """
    super().__init__(language)
    self._ignored: T.Set[str] = set()
    try:
        self._dict = spellchecker.SpellChecker(language=language)
    except ValueError as err:
        # Fix: chain the original ValueError so the real cause (unsupported
        # language) stays visible in tracebacks (PEP 3134).
        raise DictNotFound(language) from err
def correct_text():
    """Spell-check a sample text file against a custom Korean dictionary and
    print the correction and candidate list for every unknown word."""
    korean_dictionary_filepath = './my_korean_dictionary.json'
    text_filepath = './Sample-2_0.txt'

    #--------------------
    print('Start loading dictionary...')
    tic = time.time()
    checker = spellchecker.SpellChecker(language='en', distance=1)  # Loads default word frequency list.
    checker.word_frequency.load_dictionary(korean_dictionary_filepath, encoding='UTF-8')
    print('End loading dictionary: {} secs.'.format(time.time() - tic))

    #--------------------
    print('Start loading a text file...')
    tic = time.time()
    try:
        with open(text_filepath, 'r', encoding='UTF-8') as infile:
            words = infile.read().split()
    except FileNotFoundError:
        print('File not found: {}.'.format(text_filepath))
        return
    except UnicodeDecodeError:
        print('Unicode decode error: {}.'.format(text_filepath))
        return
    print('End loading a text file: {} secs.'.format(time.time() - tic))

    #--------------------
    # Find those words that may be misspelled.
    misspelled = checker.unknown(words)
    print('Words = {}.'.format(words))
    print('Misspelled words = {}.'.format(misspelled))
    for word in misspelled:
        print("Start correcting a word, '{}'...".format(word))
        tic = time.time()
        # Get the one 'most likely' answer.
        print('spell.correction({}) = {}.'.format(word, checker.correction(word)))
        # Get a list of 'likely' options.
        print('spell.candidates({}) = {}.'.format(word, checker.candidates(word)))
        print('End correcting a word: {} secs.'.format(time.time() - tic))
def simple_korean_example(): spell = spellchecker.SpellChecker( language='en', distance=2) # Loads default word frequency list. if True: dictionary_filepath = './my_korean_dictionary.json' spell.word_frequency.load_dictionary(dictionary_filepath, encoding='UTF-8') else: text_filepath = './korean_modern_novel_1_2.txt' spell.word_frequency.load_text_file(text_filepath, encoding='UTF-8') # Find those words that may be misspelled. misspelled = spell.unknown(['천재즈변', '학교', '도시관', '도소관', '요기']) for word in misspelled: # Get the one 'most likely' answer. print('spell.correction({}) = {}.'.format(word, spell.correction(word))) # Get a list of 'likely' options. print('spell.candidates({}) = {}.'.format(word, spell.candidates(word))) print("spell.word_probability('학교') = {}.".format( spell.word_probability('학교'))) #text_data = "A blue whale went for a swim in the sea. Along it's path it ran into a storm. To avoid the storm it dove deep under the waves." #spell.word_frequency.load_text(text_data) # If I just want to make sure some words are not flagged as misspelled. spell.word_frequency.load_words(['마이크로소프트', '애플', '구글']) print("spell.known(['마이크로소프트', '구글', '페이스북']) = {}.".format( spell.known(['마이크로소프트', '구글', '페이스북']))) # Will return both now! spell.word_frequency.load_words(['microsoft', 'apple', 'google']) print("spell.known(['microsoft', 'google', 'facebook']) = {}.".format( spell.known(['microsoft', 'google', 'facebook']))) # Will return both now! print('len(spell.word_frequency.dictionary) = {}.'.format( len(spell.word_frequency.dictionary))) print('spell.word_frequency.total_words = {}.'.format( spell.word_frequency.total_words)) print('spell.word_frequency.unique_words = {}.'.format( spell.word_frequency.unique_words)) print('len(spell.word_frequency.letters) = {}.'.format( len(spell.word_frequency.letters))) print('spell.word_frequency.longest_word_length = {}.'.format( spell.word_frequency.longest_word_length))
def spellcheck_tokens(tokens, language="en", distance=1):
    '''Spell-check every token in-place and return the corrected list.

    Parameters
    ----------
    tokens : list of token lists (one inner list per sentence); each word is
        replaced by its most likely correction
    language : language code used for the spell check
    distance : edit distance (1 or 2) — candidates that many letter
        permutations away are considered and the most frequent is chosen.
        The package author recommends 1 for longer words to avoid odd
        corrections.

    Progress is reported every 1000 sentences.'''
    checker = spellchecker.SpellChecker(language=language, distance=distance)
    for processed, sentence in enumerate(tokens, start=1):
        for idx, word in enumerate(sentence):
            sentence[idx] = checker.correction(word)
        if processed % 1000 == 0:
            print(processed, "sentences checked.", end="\r", flush=True)
    return tokens
def spelling_correction(text):
    """Lowercase *text*, strip digits and sentence punctuation, then replace
    each misspelled word with its most likely correction.

    Fixes: the digit regex is now a raw string (``"\\d+"`` as a plain string
    triggers an invalid-escape deprecation), and corrections that come back
    as ``None`` (no candidate found) are skipped instead of being passed to
    ``str.replace`` where they would raise TypeError.

    :param text: the text to correct (non-string input is reported and
        handled best-effort, matching the original behavior)
    :returns: the corrected text
    """
    # take numbers out of the string if possible
    try:
        text = re.sub(r"\d+", ' ', text).lower()
    except (AttributeError, TypeError) as error:
        print(type(text))
    # convert string to lowercase and take out some punctuation that is difficult to map to sentiment
    try:
        text = text.replace(',', ' ').replace('.', ' ')
    except (AttributeError, TypeError) as error:
        print(type(text))
    # correct spelling
    spell = spellchecker.SpellChecker()
    misspelled = spell.unknown(word_tokenize(text))
    corrections = [(word, spell.correction(word)) for word in list(misspelled)]
    for original, fixed in corrections:
        if fixed is not None:  # correction() returns None when it has no candidate
            text = text.replace(original, fixed)
    return text
def __init__(self, name):
    """Create a dictionary backed by pyspellchecker for *name*."""
    BasicDictionary.__init__(self, name)
    # NOTE(review): assumes self.name is a valid pyspellchecker language code — confirm
    self._dict = pyspellchecker.SpellChecker(self.name)
    # Register the user's custom words so they are not reported as misspelled.
    self._dict.word_frequency.load_words(self._customDict)
import nltk
from nltk.tokenize import word_tokenize
import nltk.corpus
from nltk.corpus import stopwords
import warnings
import pandas as pd
import string
from string import printable
import random
import time
import spellchecker

# Shared spell checker; seed it with local proper nouns (Technion/Haifa
# neighborhood names) so they are not flagged as typos.
spell = spellchecker.SpellChecker()
spell.word_frequency.load_words(['technion', 'haifa', 'ziv', 'neve', 'nave', 'shaanan'])

# One-time NLTK corpus downloads required by the tokenizer/stopword code.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('gutenberg')
warnings.filterwarnings(action='ignore')

# Canned phrase lists — presumably matched against user input by a chatbot
# elsewhere in this file; confirm against callers.
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey",)
WHATSAPP = ['hi how are you', 'how are you', "what's up", 'hi there', 'sup', 'how are you doing']
WHATSAPP_RESPONSES = ["I'm great! Thanks", 'Wonderful thanks!', 'Everything is great thanks!']
GREETING_RESPONSES = ["hi", "hey", "howdy", "hi there", "hello"]
THANKS_RESPONSES = ["You are welcome!", "Sure, no problem.", "Gladly", "Sure thing!", "Glad to assist."]
POSITIVE_INPUTS = ("yes", "sure", "ok", "yes please", "k", "cool", "great", 'ok cool', 'awesome', 'yup', 'yhh')
NEGATIVE_INPUTS = ("no", "no thanks", "na", "i'll pass", "forget it", "never mind",)
LIMIT_INPUTS = ("up to", "not more than", "maximum", "max")
# Extra tokens to treat as stopwords on top of NLTK's English list.
STOPWORDS_ADD_ON = GREETING_RESPONSES + WHATSAPP + ["find", "i", "need", "to", "please", "want", "apartment", "something", "next", "would", "like", "lower", 'less', 'cheaper']
# Tokens that signal room counts or prices in apartment queries.
ROOMS_WORD = ['bd', 'rooms', 'room', 'space', 'spaces', 'bed room', 'bed rooms', 'bedroom', 'bedrooms', 'nis', 'shekels']
def construct_korean_dictionary_example():
    """Build a Korean word-frequency dictionary file (gzipped JSON) from a
    novel text file, for later use with pyspellchecker, and print statistics
    about the resulting frequency table."""
    text_filepath = './korean_modern_novel_1_2.txt'
    dictionary_filepath = './my_korean_dictionary.json'

    # In my_korean_dictionary.json:
    # {
    #     "가": 1,
    #     "나": 2,
    #     "사과": 45,
    #     "자전거": 60
    # }

    # NOTE(review): the konlpy noun-extraction path is disabled (if False);
    # only the raw text-file path below actually runs.
    if False:
        import konlpy
        #import nltk

        # Initialize the Java virtual machine (JVM).
        #konlpy.jvm.init_jvm(jvmpath=None, max_heap_size=1024)

        print('Start loading a text file...')
        start_time = time.time()
        try:
            with open(text_filepath, 'r', encoding='UTF-8') as fd:
                text_data = fd.read()
        except FileNotFoundError as ex:
            print('File not found: {}.'.format(text_filepath))
            return
        except UnicodeDecodeError as ex:
            print('Unicode decode error: {}.'.format(text_filepath))
            return
        print('End loading a text file: {} secs.'.format(time.time() - start_time))

        # TODO [check] >> Is it good to extract nouns or do POS tagging?
        print('Start preprocessing texts...')
        start_time = time.time()
        #kkma = konlpy.tag.Kkma()
        #text_data = kkma.nouns(text_data)
        okt = konlpy.tag.Okt()
        text_data = okt.nouns(text_data)
        print('End preprocessing texts: {} secs.'.format(time.time() - start_time))
        text_data = ' '.join(text_data)

        print('Start saving a Korean dictionary...')
        start_time = time.time()
        # language=None: start from an empty frequency list instead of English.
        spell = spellchecker.SpellChecker(language=None)
        spell.word_frequency.load_text(text_data)
        spell.export(dictionary_filepath, encoding='UTF-8', gzipped=True)
        print('End saving a Korean dictionary: {} secs.'.format(time.time() - start_time))
    else:
        print('Start saving a Korean dictionary...')
        start_time = time.time()
        # language=None: start from an empty frequency list instead of English.
        spell = spellchecker.SpellChecker(language=None)
        spell.word_frequency.load_text_file(text_filepath, encoding='UTF-8')
        spell.export(dictionary_filepath, encoding='UTF-8', gzipped=True)
        print('End saving a Korean dictionary: {} secs.'.format(time.time() - start_time))

    # Report statistics about the frequency table that was just built.
    print('len(spell.word_frequency.dictionary) = {}.'.format(
        len(spell.word_frequency.dictionary)))
    print('spell.word_frequency.total_words = {}.'.format(
        spell.word_frequency.total_words))
    print('spell.word_frequency.unique_words = {}.'.format(
        spell.word_frequency.unique_words))
    print('len(spell.word_frequency.letters) = {}.'.format(
        len(spell.word_frequency.letters)))
    print('spell.word_frequency.longest_word_length = {}.'.format(
        spell.word_frequency.longest_word_length))
def simple_example():
    """Walk through basic pyspellchecker usage: constructor options,
    correction/candidates for unknown words, loading extra frequency data,
    and inspecting the word-frequency object."""
    spell = spellchecker.SpellChecker(
        language='en',  # Supported languages: 'en', 'es', 'de', 'fr' and 'pt'. Defaults to 'en'.
        local_dictionary=None,  # The path to a locally stored word frequency dictionary. If provided, no language will be loaded.
        distance=2,  # The edit distance to use. Defaults to 2.
        tokenizer=None,
        case_sensitive=False)

    # Find those words that may be misspelled.
    misspelled = spell.unknown(['something', 'is', 'hapenning', 'here'])
    for word in misspelled:
        # Get the one 'most likely' answer.
        print('spell.correction({}) = {}.'.format(word, spell.correction(word)))
        # Get a list of 'likely' options.
        print('spell.candidates({}) = {}.'.format(word, spell.candidates(word)))

    print("spell.word_probability('here') = {}.".format(
        spell.word_probability('here')))

    #--------------------
    # If the Word Frequency list is not to your liking, you can add additional text to generate a more appropriate list for your use case.
    spell = spellchecker.SpellChecker()  # Loads default word frequency list.
    # NOTE(review): only the load_text branch runs; the first two branches are
    # kept as alternative loading examples.
    if False:
        # In my_dictionary.json
        # {
        #     "a": 1,
        #     "b": 2,
        #     "apple": 45,
        #     "bike": 60
        # }
        dictionary_filepath = './my_dictionary.json'
        spell.word_frequency.load_dictionary(dictionary_filepath, encoding='UTF-8')
    elif False:
        text_filepath = './my_text.txt'
        spell.word_frequency.load_text_file(text_filepath, encoding='UTF-8')
    elif True:
        text_data = "A blue whale went for a swim in the sea. Along it's path it ran into a storm. To avoid the storm it dove deep under the waves."
        spell.word_frequency.load_text(text_data)

    # If I just want to make sure some words are not flagged as misspelled.
    spell.word_frequency.load_words(['microsoft', 'apple', 'google'])
    print("spell.known(['microsoft', 'google', 'facebook']) = {}.".format(
        spell.known(['microsoft', 'google', 'facebook'])))  # Will return both now!

    # Inspect the underlying word-frequency object.
    print('len(spell.word_frequency.dictionary) = {}.'.format(
        len(spell.word_frequency.dictionary)))
    print('spell.word_frequency.total_words = {}.'.format(
        spell.word_frequency.total_words))
    print('spell.word_frequency.unique_words = {}.'.format(
        spell.word_frequency.unique_words))
    print('len(spell.word_frequency.letters) = {}.'.format(
        len(spell.word_frequency.letters)))
    print('spell.word_frequency.longest_word_length = {}.'.format(
        spell.word_frequency.longest_word_length))
    print('spell.word_frequency.tokenize(text_data)) = {}.'.format(
        list(spell.word_frequency.tokenize(text_data))))
    print('spell.word_frequency.keys()) = {}.'.format(
        list(word for idx, word in enumerate(spell.word_frequency.keys()) if idx < 20)))
    print('spell.word_frequency.words()) = {}.'.format(
        list(word for idx, word in enumerate(spell.word_frequency.words()) if idx < 20)))
    print('spell.word_frequency.items()) = {}.'.format(
        list(word for idx, word in enumerate(spell.word_frequency.items()) if idx < 20)))

    #--------------------
    # If the words that you wish to check are long, it is recommended to reduce the distance to 1.
    spell = spellchecker.SpellChecker(distance=1)  # Set at initialization.
    # Do some work on longer words.
    spell.distance = 2  # Set the distance parameter back to the default.
r'\mu', r'\nu', r'\xi', r'\omicron', r'\pi', r'\rho', r'\sigma', r'\tau', r'\upsilon', r'\phi', r'\chi', r'\psi', r'\omega', ] # spell_checker = spellchecker.SpellChecker(distance=1) spell_checker.word_frequency.remove_words(bad_words_in_spellchecker) spell_checker.word_frequency.load_words(greek_alphabet_latex_command) spell_checker.word_frequency.load_words(extra_special_words) # --------------------------------------------------------------------------- # # add program name to system error call def sys_exit(msg, file_in=None, section_name=None): if file_in != None: msg += '\nfile = ' + file_in if section_name != None: msg += ', section = ' + section_name sys.exit('bin/extract_md.py:\n' + msg)
def callback_inline(call):
    """Handle inline-keyboard callbacks from the Telegram bot.

    Dispatches on ``call.data``: "start"/"help" send info messages,
    "scan"/"text" prompt for a photo, "ncorrect" drops the pending entry for
    this chat from data.json, and "correct" spell-checks the stored text and
    sends the corrected version back.

    Fixes: file handles are now closed via ``with``; the pending-entry list is
    no longer mutated while being iterated (iterate a copy); the word-flush
    condition is parenthesized (``and`` bound tighter than ``or``, so a space
    flushed even an empty word); ``text.find(w) > 0`` skipped a match at
    index 0; and a ``None`` result from ``spell.correction`` is skipped
    instead of being concatenated into the text.
    """
    if call.data == "start":
        bot.send_message(call.message.chat.id,
                         "Convert into text or simply scan",
                         reply_markup=keyboards.keyboard_2)
    if call.data == "help":
        bot.send_message(
            call.message.chat.id,
            "Documentbot is a simple bot that does scan images from"
            "simple photos or docs. Spell checking with correcting is also "
            "provided, but recommended only with texts contain 10% of mistakes."
            "Have fun:)")
    if (call.data == "scan") or (call.data == "text"):
        # NOTE(review): `key` is a local that is never read afterwards —
        # possibly meant to be module-level state; kept as-is.
        if call.data == "scan":
            key = 1
        if call.data == "text":
            key = 2
        bot.send_message(
            call.message.chat.id,
            "OK. Send me a photo of a document. N/B! It has to be with sheet edges."
        )
    if call.data == "ncorrect":
        with open("data.json", encoding="utf8") as infile:
            data = json.load(infile)
        # Iterate over a copy so removing entries doesn't skip elements.
        for connect in list(data):
            if connect["id"] == call.message.chat.id:
                data.remove(connect)
        with open("data.json", "w") as output:
            json.dump(data, output)
    if call.data == "correct":
        bot.send_message(call.message.chat.id, "processing")
        spell = spellchecker.SpellChecker()
        word = ""
        words = []
        with open("data.json") as infile:
            data = json.load(infile)
        # Iterate over a copy so removing entries doesn't skip elements.
        for connect in list(data):
            if connect["id"] == call.message.chat.id:
                text = connect["text"] + " "
                print("TEXTTT \n" + text)
                # Split the stored text into words, treating punctuation as
                # word boundaries.
                for char in text:
                    if (char != " ") and (char != "!") and (char != "?") and (
                            char != ".") and (char != ",") and (char != "\n"):
                        word = word + char
                    # Fix: parenthesized — previously a space flushed even
                    # when `word` was empty, appending "" entries.
                    if (char == " " or char == "\n") and word != '':
                        words.append(word)
                        word = ""
                print("all words" + str(words))
                wrong = spell.unknown(words)
                print("wrong words: " + str(wrong))
                for w in wrong:
                    low = w.lower()
                    print(low)
                    rep = spell.correction(low)
                    # Fix: correction() can return None; skip in that case.
                    if rep is not None and rep != low:
                        # Fix: find() returns 0 for a match at the start of
                        # the text; `> 0` used to skip that occurrence.
                        i = text.find(w)
                        while i != -1:
                            text = text[:i] + rep + " " + text[i + len(w):]
                            i = text.find(w)
                print("Corrected \n" + text)
                bot.send_message(call.message.chat.id, "corrected text: \n" + text)
                data.remove(connect)
                with open("data.json", "w") as output:
                    json.dump(data, output)
args = parser.parse_args() distance = args.distance language = args.lang load = args.load predict = args.predict if distance is None: distance = 2 if language is None: language = 'en' if predict is not None: predict = 1 if load == 'hi': spell = spellchecker.SpellChecker(distance=distance) spell.word_frequency.load_text_file('./resources/hi.txt') else: spell = spellchecker.SpellChecker(language=language, distance=distance) while 1: print(">>", end=' ') s = input() if s == 'quit': print("Bye!") break s = s.split() misspelled = spell.unknown(s) correct = s.copy() # print(misspelled) a = []