def get_lastest_reading():
    # Poll the most recently detected label and accumulate it into a phrase;
    # when a "dot" label arrives, spell-correct the phrase, optionally translate it,
    # and speak it with gTTS on a background thread.
    global phrase, lastest_reading
    while get_lastest_reading_thread.is_alive():
        label = lastest_reading
        if label != "":
            if label == "dot":
                if phrase != "":
                    spell = Speller()
                    phrase = spell(phrase)
                    if lang != "en":
                        translate_client = translate.Client()
                        translated_text = translate_client.translate(
                            phrase, target_language=lang)["translatedText"]
                        voice = gTTS(
                            text=translated_text,
                            lang=lang,
                            tld=top_domain[output_language],
                            slow=False)
                    else:
                        voice = gTTS(
                            text=phrase,
                            lang=lang,
                            tld=top_domain[output_language],
                            slow=False)
                    voice_file = "voice-{}.mp3".format(str(randint(1, 9999999)))
                    voice.save(voice_file)
                    play_audio_thread = threading.Thread(target=play_audio, args=[voice_file])
                    play_audio_thread.daemon = True
                    play_audio_thread.start()
                    print(TGREEN + "audio:" + phrase, ENDC)
                    phrase = ""
            elif label == "space":
                phrase += " "
                spell = Speller()
                phrase = spell(phrase)
                print(TGREEN + phrase, ENDC)
            else:
                phrase += label
                print(TGREEN + label, ENDC)
            time.sleep(detection_speed)
            lastest_reading = ""

def correctSpelling(self):
    speller = Speller()
    word_tokens = word_tokenize(self.text)
    lengthened = [self.reduce_lengthening(word) for word in word_tokens]
    spelled = [speller.autocorrect_word(word) for word in lengthened]
    self.text = " ".join(spelled)
    return self

def loading_data(filename, basepath, extension):
    # Assumptions: the processed file is saved in the same folder under the pattern filename + 'P',
    # and a column named "Comment" exists, on which the text processing is performed.
    # The function reads the csv file and returns a pandas dataframe.
    # Spell correction is time demanding, so the processed result is saved once and
    # loaded on later calls instead of being recomputed.
    processed_filename = filename + 'P'
    processed_path = os.path.join(basepath, processed_filename + '.' + extension)
    # If the processed file already exists, load it.
    if os.path.exists(processed_path):
        data = pd.read_csv(processed_path)
    else:
        start_time = time.time()
        # Load the speller
        spell = Speller(lang='es')
        # Load the raw data
        data_init = pd.read_csv(os.path.join(basepath, filename + '.' + extension))
        # Perform spell correction and lowercase the comment column
        data = spell_correction(data_init, 'Comment', 'processed comment', spell)
        # Eliminate stopwords
        data = stopwords_correction(data, 'processed comment')
        # Save the file in the processed-file format
        try:
            data.to_csv(processed_path)
        except Exception:
            print("Oops!", sys.exc_info()[0], "occurred.")
            raise Exception("Couldn't save the dataframe")
        print(time.time() - start_time)
    return data

def preprocess_text(doc):
    # Lowercase
    doc = doc.lower()
    # Remove numbers
    doc = re.sub(r'\d+', '', doc)
    # Transliterate unicode characters to ASCII
    doc = unidecode.unidecode(doc)
    # Remove special characters
    doc = re.sub(r'[^\w\s]', '', doc)
    # Spelling check
    spell = Speller(lang='en')
    doc = spell(doc)
    # Remove stopwords and lemmatize
    doc = [lemmer.lemmatize(w) for w in doc.split() if w not in stop_word]
    # Remove rare words generated by lemmatization
    doc1 = []
    common_and_rare_words = ['wa', 'ha', 'ive', 'im', 'youd', 'names']
    for wd in doc:
        if wd not in common_and_rare_words:
            doc1.append(wd)
    return ' '.join(doc1)

def my_tokenizer(self, df, col_name):
    """Creates a new column "tokens" in the dataframe from column 'col_name'.
    Creates a list of lowercase words, removes stopwords and removes strings containing no letters."""
    # tokenize
    df['tokens'] = df[col_name].apply(lambda x: nltk.word_tokenize(x))
    # lowercase
    df['tokens'] = df.tokens.apply(lambda l: [w.lower() for w in l])

    def real_word_filter():
        # load and remove the set of stopwords
        stops = set(stopwords.words('english'))
        df['tokens'] = df.tokens.apply(lambda l: [w for w in l if w not in stops])
        # keep only words with at least two letters, allowing a dash (like 'fast-growing')
        # pattern = r"[a-z]+"
        pattern = r"^[a-z]+[-]?[a-z]+$"
        df['tokens'] = df.tokens.apply(lambda l: [w for w in l if bool(re.match(pattern, w))])

    # remove stopwords and apply the pattern match
    real_word_filter()
    # spell checker
    spell = Speller()
    df['tokens'] = df.tokens.apply(lambda l: [spell(w) for w in l])
    # remove stopwords and apply the pattern match a second time, after the spell checker
    real_word_filter()

def processChunk(self, list, output, procId):
    objs = {
        'speller': Speller(),
        'lemmatizer': WordNetLemmatizer(),
        'stemmer': nltk.stem.SnowballStemmer('english'),
        'mapper': Word2VecMapper(),
        'stop': set(stopwords.words('english'))
    }
    csv = pd.DataFrame(columns=["id", "embedding", "polarity"])
    flags = copy.copy(self.flags)
    for idx, value in enumerate(list):
        if idx % 10 == 0:
            LOGGER.debug('{}, done {}/{}'.format(
                procId, idx + 1, int(len(self.data_set) / self.numberOfProcesses)))
        csv = self.processSingleDataSetValue(value[0], value[1], value[2],
                                             output, objs, flags, csv)
    LOGGER.debug("{}, finished processing".format(procId))
    # output.cancel_join_thread()
    if "-csv" in sys.argv:
        path = "processed_data_set" if self.set else "processed_test_set"
        csv.to_csv(self.config.readValue(path).split(".")[0] + "_" + str(procId) + ".csv",
                   sep=";", index=False)

def text_preprocessing(data, text_cols):
    lemmatizer = WordNetLemmatizer()
    combined = pd.concat([data['train'], data['test']], axis=0)
    spell = Speller(fast=True)
    for col in text_cols:
        combined[col] = combined[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
    stop_words = set(stopwords.words('english'))
    for col in text_cols:
        preprocessed_text = []
        for words in combined[col]:
            if words is not np.nan:
                words = word_tokenize(words)
                words = [word for word in words if word.isalpha()]
                words = [word for word in words if word not in stop_words]
                words = [spell(word) for word in words]
                words = [lemmatizer.lemmatize(word) for word in words]
                preprocessed_text.append(' '.join(words))
            else:
                preprocessed_text.append(np.nan)
        combined[col] = preprocessed_text
    data['train'] = combined.iloc[:len(data['train'])]
    data['test'] = combined.iloc[len(data['train']):]

def __init__(self, numberOfProcesses=mp.cpu_count() - 1, optional_length=None):
    if "--log" in sys.argv:
        logging.basicConfig(level=logging.DEBUG)
    self.EMBEDDING = "-embedding" in sys.argv
    self.EMBEDDING_LENGTH = 300
    self.SEQUENCE_LENGTH = 2665
    self.TIMEOUT = 15
    self.OVERHEAD_TIMEOUT = 45
    self.explorer = DataExplorer()
    self.resultsProcessor = ResultsProcessor()
    self.englishStopWords = set(stopwords.words('english'))
    self.text = ''
    self.config = Config()
    self.flags = {
        'spelling': False,
        'stopWords': False,
        'lemmatize': False,
        'stem': False,
    }
    self.speller = Speller()
    self.lemmatizer = WordNetLemmatizer()
    self.stemmer = nltk.stem.SnowballStemmer('english')
    self.numberOfProcesses = numberOfProcesses
    self.mapper = Word2VecMapper()
    self.optional_length = optional_length

def predict(image_name):
    output_image_path = os.path.join("api", "temp")
    input_image_path = os.path.join(output_image_path, image_name)
    tokenizer = Tokenizer()
    model = MyModel(vocab_size=tokenizer.vocab_size,
                    beam_width=20,
                    stop_tolerance=15,
                    reduce_tolerance=10)
    model.compile(learning_rate=0.001)
    model.load_checkpoint(target=target_path)
    imgproc.__execute__(input_image_path, output_image_path)
    text = []
    confidence = []
    image_lines = sorted(
        glob(
            os.path.join(output_image_path, image_name.split('.')[0], "lines", "*.png")))
    for img in image_lines:
        img = pp.preprocess_image(img, target_image_size, predict=True)
        img = pp.normalization([img])
        predicts, probabilities = model.predict(img, ctc_decode=True)
        predicts = tokenizer.sequences_to_texts(predicts)
        confidence.append(f"{predicts[0]} ==> {probabilities[0]}")
        text.append(Speller("en").autocorrect_sentence(predicts[0][0]))
    return "\n".join(text)

class AutoCorrectSpellingChecker(SpellingChecker):
    def __init__(self, spelling_config=None):
        SpellingChecker.__init__(self, spelling_config)
        self._speller = Speller()

    def correct(self, phrase):
        return self._speller.autocorrect_sentence(phrase).upper()

def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    spell = Speller(lang='en')
    for (i, line) in enumerate(lines):
        if i == 0 or line == []:
            continue
        guid = "%s-%s" % (set_type, i)
        text_a = line[3]  # generate_misspelling(line[3])
        # try:
        #     text_a = spell(text_a)
        # except:
        #     pass
        text_b = line[4]
        # pos = text_b.find(text_a)
        # text_a = text_b[:pos] + " <b> " + text_b[pos:pos + len(text_a)] + " </b> " + text_b[pos + len(text_a):]
        # text_b = None
        if len(line) < 6 or line[5] == '?':
            label = self.get_labels()[0]
        else:
            label = line[5]
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def process(self, message, **kwargs):
    from autocorrect import Speller
    spell = Speller(lang='en')
    mesg = message.text  # get the original message
    text = spell(mesg)  # correct the message with autocorrect
    message.text = text  # set the corrected message for the next components to process

def replace_repetition(text, spell=True):
    # collapse characters repeated three or more times into a single occurrence
    text = re.sub(r"((\w)\2{2,})", r'\2', text)
    if spell:
        spell = Speller(lang='en')
        text = spell(text)
    return text

def preprocess(data, colname, applyList=[1, 2, 3]):
    def docPreprocess(badText):
        funcDict = {}
        funcDict[1] = lambda x: [spell(word) for word in x]
        funcDict[2] = lambda x: [
            nltk.stem.SnowballStemmer('english').stem(word) for word in x
        ]
        funcDict[3] = lambda x: [
            nltk.stem.WordNetLemmatizer().lemmatize(word) for word in x
        ]
        betterText = badText.split()
        betterText = [word for word in betterText if word not in stopWordsSet]
        betterText = [word for word in betterText if word not in freq]
        for i in applyList:
            betterText = funcDict[i](betterText)
        return betterText

    stopWordsSet = set(nltk.corpus.stopwords.words('english'))
    spell = Speller(lang='en')
    # data = data.copy()
    data[colname] = data[colname].str.replace(r"[0-9]", "", regex=True).str.replace(
        r"[^\s\w]", "", regex=True).str.lower().str.encode('ascii', 'ignore').str.decode('ascii')
    freq = pd.Series(' '.join(data[colname]).split()).value_counts()[:10]
    data[colname] = data[colname].apply(docPreprocess)
    data = data.dropna()
    return data

def removeStopWords(self, text):
    spell = Speller(lang='en')
    clean_word_list = spell(text)
    clean_word_list = [
        word for word in clean_word_list.split() if word not in stoplist
    ]
    return clean_word_list

def func(txt):
    tokenized = txt
    stop_words = set(stopwords.words('english'))
    wordslist = nltk.word_tokenize(tokenized)
    tagged = nltk.pos_tag(wordslist)
    spell = Speller(lang='en')
    words = [(spell(w), tag) for w, tag in tagged if w not in stop_words]
    return words

def remove_spelling_errors(list):
    '''remove spelling errors via the speller module'''
    spell = Speller()
    new_list = []
    for word in list:
        word_new = spell(word)
        new_list.append(word_new)
    return new_list

def auto_correct(text):
    spell = Speller(lang='en')
    tokens = word_tokenize(text)
    filtered_text = []
    for i in tokens:
        filtered_text.append(spell(i))
    filtered = ' '.join(filtered_text)
    return filtered

def stem_words(tokenized_data):
    stemmer = PorterStemmer()
    spell = Speller(lang='en')
    for i in range(len(tokenized_data)):
        tokenized_data[i] = stemmer.stem(tokenized_data[i])
    return tokenized_data

def itt_OCR(image, config='--psm 4 --oem 1'):
    # oem 1, 2 -- psm 6, 11, 4, 1, 3, 12
    pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
    text = pytesseract.image_to_string(image, lang='eng', config=config)
    spell = Speller()
    text = spell(text)
    print("text is: \n" + text)
    return text

def process_not_found_node(self, w):
    w = self.no_accent_vietnamese(w)
    spell = Speller(lang='en')
    predicted = spell(w)
    return {
        'prediction': '' if predicted == w else predicted,
        'm_eng': {},
        'm_vn': {}
    }

def main(args):
    vocab = build_vocab(json=args.caption_path, threshold=args.threshold, spell=Speller())
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    print("Total vocabulary size: {}".format(len(vocab)))
    print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

def __init__(self, corpus_size):
    self.vector_size = 300
    self.speller_obj = Speller(lang='en')
    self.stop_words = many_stop_words.get_stop_words("en")
    self.spacy_obj = spacy.load('en_core_web_sm')
    self.tokenizer_obj = Tokenizer(num_words=corpus_size, oov_token="<OOV>")
    with open("normalize_mapping.json") as normalize_file_obj:
        self.normalize_mapping = json.load(normalize_file_obj)

def correct_spelling(data):
    spell = Speller(fast=True)
    tweets = {}
    for key, value in data:
        # strip common text emoticons before spell correction
        no_sym_emojis = re.sub(r' \:\) | \:\( | \:D | \:P | \:o | xD | \:\/ ', ' ', value)
        corrected = spell(no_sym_emojis)
        tweets[key] = " ".join(corrected.split())
    return tweets

def correct_sentence(line):
    spell = Speller()
    lines = line.strip().split(' ')
    new_line = ""
    similar_word = {}
    for l in lines:
        new_line += spell(l) + " "
        # similar_word[l] = spell.candidates(l)
    return new_line

def correct_word(word):
    spell = Speller(lang='en')
    words = word.strip().split(' ')
    new_word = ""
    similar_word = {}
    for l in words:
        new_word += spell(l) + " "
        # similar_word[l] = spell.candidates(l)
    return new_word

def _main_():
    processed_file_datasatisfaction = 'satisfaction_ratingP'
    processed_file_npsresponse = 'NPS_responsesP'
    if os.path.exists(
            os.path.join(os.getcwd(), 'data', processed_file_datasatisfaction + '.csv')):
        data_satisfaction = loading_data(
            os.path.join(os.getcwd(), 'data', processed_file_datasatisfaction + '.csv'))
    else:
        start_time = time.time()
        spell = Speller(lang='es')
        data_satisfaction = loading_data('data/satisfaction_Ratings.csv')
        data_satisfaction = spell_correction(data_satisfaction, 'Comment',
                                             'processed comment', spell)
        save_dfdata(data_satisfaction, processed_file_datasatisfaction)
        print(time.time() - start_time)
    if os.path.exists(
            os.path.join(os.getcwd(), 'data', processed_file_npsresponse + '.csv')):
        data_responses = loading_data(
            os.path.join(os.getcwd(), 'data', processed_file_npsresponse + '.csv'))
    else:
        start_time = time.time()
        spell = Speller(lang='es')
        data_responses = loading_data('data/NPS_Responses.csv')
        data_responses = spell_correction(data_responses, 'Comment',
                                          'processed comment', spell)
        save_dfdata(data_responses, processed_file_npsresponse)
        print(time.time() - start_time)
    print('there are ' + str(len(data_satisfaction['Ticket Id'].unique())) + ' unique tickets')
    print('there are ' + str(len(data_satisfaction)) + ' tickets')
    duplicate_tickets = data_satisfaction.groupby('Ticket Id').size(
    ).sort_values(ascending=False).reset_index(name='tickets count')
    duplicate_example = data_satisfaction[data_satisfaction['Ticket Id'] ==
                                          duplicate_tickets['Ticket Id'][3]]
    satisfaction_wordcount = word_count(data_satisfaction, 'processed comment')
    print('finished...')

def get_bot_response():
    spell = Speller()
    userText = request.args.get('msg')
    response = str(chat.chatbot_response(userText))
    if response in [
            "Sorry, can't understand you", "Please give me more info",
            "Not sure I understand"
    ]:
        # retry with a spell-corrected version of the user's message
        userText = spell(userText)
        response = str(chat.chatbot_response(userText))
    return response

def correct_sentence(self, str):
    '''Corrects typos in the given string.'''
    words = str.split()
    corrected_string = []
    spell = Speller(lang='en')
    for w in words:
        lw = w.lower()
        lw = spell(lw)
        corrected_string.append(lw)
    return " ".join(corrected_string)

def check_spelling(search_word):
    spell = Speller(lang='en')
    search = search_word.split()
    corrected = []
    for word in search:
        corrected.append(spell(word))
    corrected = ' '.join(corrected)
    return corrected

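# All of the snippets above go through the same small surface of the autocorrect
# package: a Speller object that is callable on raw text, its autocorrect_word and
# autocorrect_sentence helpers, and the lang / fast constructor options. The sketch
# below is a minimal standalone illustration of those calls, assuming autocorrect is
# installed; the input strings are invented placeholders and the exact corrections
# returned are not guaranteed.
from autocorrect import Speller

# Default English speller; calling it spell-corrects every word in the string.
spell = Speller(lang='en')
print(spell("ther is a problm with this sentnce"))

# Word- and sentence-level helpers used in several of the snippets above.
print(spell.autocorrect_word("mussage"))
print(spell.autocorrect_sentence("please read this mussage"))

# fast=True trades some accuracy for speed, as in the tweet-processing examples.
fast_spell = Speller(fast=True)
print(fast_spell("ther is a problm"))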