def printCorrect(word):
    word_new = spell(word)
    if word_new != word:
        # erase the word just typed (plus the trailing space), then retype the correction
        for i in word:
            pyautogui.press('backspace')
        pyautogui.press('backspace')
        pyautogui.typewrite(word_new + " ")
def cleanData(self, records, stemText=False):
    output = []
    recordsChecked = 0
    recordsToCheck = len(records)
    for sentence in records:
        sentence = str(sentence)
        recordsChecked += 1
        sys.stdout.write("\rRecords cleaned : %i / %i" % (recordsChecked, recordsToCheck))
        cleanSentence = ''
        if len(sentence) < 200:
            words = sentence.split(' ')
            for word in words:
                if len(word) < 12:
                    if word.isalpha():
                        if stemText:
                            cleanSentence += self.st.stem(spell(word.lower())) + ' '
                        else:
                            cleanSentence += word.lower() + ' '
            if cleanSentence:
                output.append(cleanSentence.strip())
    sys.stdout.write("\n")
    sys.stdout.flush()
    self.cleanedRecords = output
def get_auto_corrected_words(words):
    """Check each word in words for an autocorrect suggestion, then return the
    corrected word list if anything was autocorrected, otherwise False."""
    autocorrected = False
    auto_corrected_words = []
    for word in words:
        if spell(word) != word:
            autocorrected = True
            auto_corrected_words.append(spell(word))
        else:
            auto_corrected_words.append(word)
    if autocorrected == False:
        return False
    else:
        return auto_corrected_words
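# Hedged usage sketch for get_auto_corrected_words (not part of the original snippet).
# It assumes `spell` comes from the legacy autocorrect API (`from autocorrect import spell`,
# available in autocorrect releases before 1.0); newer releases expose a Speller class instead.
from autocorrect import spell

def demo_get_auto_corrected_words():
    # Returns False when nothing was corrected, otherwise the corrected word list.
    corrected = get_auto_corrected_words(["helo", "world"])
    if corrected:
        print(" ".join(corrected))
    else:
        print("no corrections were suggested")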
def correctList(lines):
    for i in range(len(lines)):
        #print(lines[i])
        sentence = lines[i].strip('\n')
        sentence = sentence.strip('.')
        if h1.match(sentence) or h2.match(sentence):
            continue
        else:
            # print("Sentence is: ", repr(sentence))
            words = sentence.split()
            string = ''
            for word in words:
                if word in locations:
                    string += word + ' '
                elif num.search(word):
                    string += word + ' '
                elif word in punctuations:
                    string += word + ' '
                elif word == '...':
                    string += word + ' '
                else:
                    corrected = spell(word)
                    string += corrected + ' '
            string = string.strip()
            lines[i] = string
    return lines
def fix_typos(sentence):
    split_sent = sentence.split()
    for i, token in enumerate(split_sent):
        if not d.check(token) and str.isalnum(token):
            split_sent[i] = spell(token).lower()
    return ' '.join(split_sent)
def search(query):
    if request.method == 'POST':
        query = request.form['search_field']
        query = spell(query)
        return redirect(url_for('search', query=query))
    re_item = "^" + query + "$"
    # collect indices of exact and partial matches on the name column
    result = [i for i, word in enumerate(master_df['name'])
              if re.search(re_item, word, flags=re.IGNORECASE)]
    result += [i for i, word in enumerate(master_df['name'])
               if re.search(query, word, flags=re.IGNORECASE)]
    # collect indices of exact and partial matches on the title column
    result += [i for i, word in enumerate(master_df['title'])
               if re.search(re_item, str(word), flags=re.IGNORECASE)]
    result += [i for i, word in enumerate(master_df['title'])
               if re.search(query, str(word), flags=re.IGNORECASE)]
    result = pd.Series(result).drop_duplicates().tolist()
    returned = [{
        "name": master_df['name'].iat[idx],
        "time": master_df['time'].iat[idx],
        "title": (master_df['title'].iat[idx].split("https:", 1))[0],
        "url": master_df['url'].iat[idx]
    } for idx in result]
    return render_template('index.html', query=query, returned=returned)
def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove punctuation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # correct mis-spelled words
    text = [spell(reduce_lengthening(t)) for t in text]
    # remove non-English and mis-spelled words
    text = check_english(text)
    # join all
    text = " ".join(text)
    return text
def imgOcrEng(file_name):
    im = Image.open(file_name)
    text = pytesseract.image_to_string(im, lang='eng')
    fin = open('temp-extracted.txt', 'w')
    fin.write(text)
    fin.close()
    fhand = open('temp-extracted.txt')
    fout = open('extracted.txt', 'w')
    for line in fhand:
        line = line.rstrip()
        words = line.split()
        for word in words:
            word = spell(word) + ' '
            fout.write(word)
        fout.write('\n')
    fout.close()
    f = open("extracted.txt", "r")
    text = f.read()
    f.close()
    os.remove("temp-extracted.txt")
    #os.remove("extracted.txt")
    return text
def performSpellCorrection(featureObj):
    checker = SpellChecker("en_US", featureObj.getText())
    for word in checker:
        word.replace(spell(word.word))
    featureObj.getLexicalFeatures().setSpellCorrection(checker.get_text())
    return featureObj
def convert_to_list(oov_string):
    oov_string = oov_string.split("'")
    oov_string = oov_string[1::2]
    for i in range(len(oov_string)):
        print(oov_string[i] + ' ', end='')
        oov_string[i] = spell(oov_string[i])
        print(oov_string[i])
    return oov_string
def autocorrect(self):
    line = ''
    for word in self.sentence.split(' '):
        if word.isdigit():
            line = line + ' ' + word
        else:
            line = line + ' ' + spell(word)
    self.sentence = line
def spell_check(sentence):
    sentence2 = ""
    sentence = sentence.split()
    for i in sentence:
        sentence2 = sentence2 + spell(i) + ' '
    sentence2 = sentence2[:-1]
    print("Sentence after spell check: ", sentence2)
    return sentence2
def auto_correct(self, vector):
    self.auto_correct_remaining -= 1
    print('\rSpell auto-correct...', self.auto_correct_remaining,
          'sentences remain', end=' ', flush=True)
    return [spell(word) for word in vector]
def do_auto_correct(text):
    final_str = ""
    for x in text.split():
        corrected = spell(x)
        final_str += corrected + " "
    if len(final_str) > 2:
        final_str = final_str[:-1]  # drop the trailing space
    return final_str
def correctMatch(match):
    word = str(match.group())
    try:
        int(word)
        corrected_word = word
    except ValueError:
        corrected_word = spell(word)
    return corrected_word
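# Hedged usage sketch for correctMatch (not part of the original snippet): the function is
# shaped like a re.sub replacement callback, so every word-like match is passed through
# spell() while purely numeric tokens are returned unchanged. The r'\w+' pattern is an
# assumption about how the original code applies it.
import re

def demo_correct_match(text):
    return re.sub(r'\w+', correctMatch, text)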
def check_one_sentence(sentence):
    new_str = ''
    for word in sentence.split():
        # keep single non-letter tokens (ASCII below 'A') as-is, spell-correct everything else
        if len(word) == 1 and ord(word[0]) < 65:
            new_str += word + ' '
        else:
            new_str += spell(word) + ' '
    return new_str
def _AutocorrectAsNeeded(word: str):
    """Detect if word needs spell correcting and return the corrected word."""
    if word in words.words():
        return word
    else:
        return spell(word)
def correct_sentence(line):
    lines = line.strip().split(' ')
    new_line = ""
    similar_word = {}
    for l in lines:
        new_line += spell(l) + " "
        # similar_word[l] = spell.candidates(l)
    return new_line
def get_bot_response():
    query = request.args.get('msg')
    query = [spell(w) for w in query.split()]
    question = " ".join(query)
    response = k.respond(question)
    if response:
        return str(response)
    else:
        return str(":)")
def spell_correct(search_term):
    search_term = search_term.split()
    corrected_string = ''
    for term in search_term:
        corrected_term = term
        if term.isalpha():
            corrected_term = spell(term)
        corrected_string += corrected_term + ' '
    return corrected_string.strip()
def autocorrect_spell(word):
    global count_spelling, count_words
    count_words = count_words + 1
    if re.match("[^A-Za-z0-9]+", word):
        if word == '\'t':
            word = 'n\'t'
            return spell(word).lower()
        else:
            return word
    else:
        if word == '\'t':
            word = 'n\'t'
            #print("n't")
        new_string = spell(word).lower()
        if new_string != word:
            count_spelling = count_spelling + 1
            incorrect_words.append(new_string + ',' + word)
        return new_string
def spell_unrecognized(vocab, tweet):
    words = re.split(r'\s+', tweet)
    new_words = []
    for word in words:
        if word in vocab:
            new_words.append(word)
            continue
        new_words.append(spell(word))
    return ' '.join(new_words)
def emphasize(word):
    if word[:2] == '..':
        return '...'
    new_word = re.sub(r'(.)\1{2,}', r'\1', word)
    if len(new_word) != len(word):
        return '<<' + spell(new_word) + '>>'
    else:
        return word
def _correct_word(self, text1):
    pattern = re.compile(r"(.)\1{2,}")
    text2 = pattern.sub(r"\1\1", text1)  # reduce lengthening
    #if text1 != text2:
    #    print(text1, text2)
    text3 = spell(text2).lower()  # spell correction
    #if text2 != text3:
    #    print(text2, text3)
    return text3
def word_correct(tweet):
    tweet = tweet.strip().split()
    for i in range(0, len(tweet)):
        word = tweet[i]
        if not wordnet.synsets(word):
            word = spell(word)
        tweet[i] = word
    tweet = ' '.join(tweet)
    return tweet
def evaluate(event):
    a = spell(e.get())
    if a.isalpha():
        res.configure(text='''Meaning of: ''' + a + " \n" + str(dic.meaning(a)))
    else:
        tkMessageBox.showinfo("INVALID INPUT", ''' Please enter appropriate word!!!''')
def evaluate(event):
    a = spell(e.get())
    if a.isalpha():
        res.configure(text='''Synonyms of: ''' + a + " \n" + str(dic.synonym(a)))
    else:
        tkMessageBox.showinfo("INVALID INPUT", ''' Please enter a word!!!''')
def ocr_core(filename):
    """This function will handle the core OCR processing of images."""
    config = ('-l eng --oem 1 --psm 3')
    # We'll use Pillow's Image class to open the image and pytesseract to detect the string in the image
    text = pytesseract.image_to_string(Image.open(filename), config=config)
    return spell(text.replace('\n', ''))
def preprocesstext(text, keyword, hashtag_remove=False, at_usr_remove=False, remove_url=False,
                   replaceSpecialChars=True, replaceNumbers=True, convertToLowerCase=True,
                   removeDefaultStopWords=True, removeGivenWords=False, stemWords=False,
                   lemmatize=False, spellcorrect=False, word_list=[]):
    assert isinstance(hashtag_remove, bool)
    assert isinstance(at_usr_remove, bool)
    assert isinstance(replaceSpecialChars, bool)
    assert isinstance(replaceNumbers, bool)
    assert isinstance(convertToLowerCase, bool)
    assert isinstance(removeDefaultStopWords, bool)
    assert isinstance(removeGivenWords, bool)
    assert isinstance(stemWords, bool)
    assert isinstance(lemmatize, bool)
    assert isinstance(spellcorrect, bool)
    assert isinstance(word_list, list) | isinstance(word_list, str)
    if hashtag_remove:
        hashtag = re.findall(r'#([A-Za-z0-9]+)', text)
        if len(hashtag) != 0:
            for i in hashtag:
                rep = '#' + i
                if keyword in i:
                    text = text.replace(rep, ' ')
                else:
                    text = text.replace(rep, i)
    if at_usr_remove:
        at_usr = re.findall(r'@[^\s]+', text)
        for i in at_usr:
            text = text.replace(i, '')
    if remove_url:
        text = re.sub(r'((www\.[^\s]+)|(https://[^\s]+))', '', text)
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"https\S+", "", text)
    # remove characters other than punctuation, numbers and letters
    text = re.sub(r"[^a-zA-Z0-9\s%s]" % re.escape(string.punctuation), " ", text)
    if replaceSpecialChars:
        text = re.sub(r"[^0-9A-Za-z']", " ", text)
    if replaceNumbers:
        text = re.sub(r"[0-9]", " ", text)
    # collapse runs of the same letter down to at most two consecutive occurrences
    text = re.sub(r'([A-Za-z])\1+', r'\1\1', text)
    if convertToLowerCase:
        text = str(text).lower()
    if removeDefaultStopWords:
        stopwords = "(^|\\s)(" + '|'.join(nltk.corpus.stopwords.words('english')) + ")(\\s|$)"
        text = re.sub(stopwords, " ", str(text))
    if removeGivenWords and len(word_list) != 0:
        if type(word_list) == str:
            text = re.sub(word_list, " ", str(text))
        else:
            otherwords = "(^|\\s)(" + str('|'.join(word_list)) + ")(\\s|$)"
            text = re.sub(otherwords, " ", str(text))
    # multiple whitespace characters collapsed to a single blank
    text = re.sub(r"\s+", " ", str(text))
    text = str(text).strip()
    if stemWords:
        text = " ".join([stem(y) for y in str(text).split() if y not in nltk.corpus.stopwords.words('english')])
    if lemmatize:
        text = " ".join([lemmatizer.lemmatize(y) for y in str(text).split() if y not in nltk.corpus.stopwords.words('english')])
    if spellcorrect:
        text = " ".join([spell(y) for y in str(text).split() if y not in nltk.corpus.stopwords.words('english')])
    return text
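# Hedged usage sketch for preprocesstext (not part of the original snippet). The sample tweet
# and keyword below are invented, and the call assumes the module-level dependencies used by
# the function (re, string, nltk stopwords, stem, lemmatizer, spell) are already in scope.
def demo_preprocesstext():
    tweet = "Loooove the new #PixelPhone, check https://example.com @someuser !!!"
    return preprocesstext(tweet, keyword="Pixel",
                          hashtag_remove=True, at_usr_remove=True, remove_url=True,
                          spellcorrect=True)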
def num_of_missplling(tweet):
    words = tweet.split()
    counter = 0
    # words = speller.unknown(words)
    for word in words:
        if word.isalpha():
            if spell(word) != word:
                counter += 1
    return counter
def claim_r_claims_present(s1):
    try:
        s1_res = s1.split(' ')
        claim_ind = 0
        claims_ind = 0
        for s in s1_res:
            if spell(str(s)).lower() == "claim":
                claim_ind = 1
            if spell(str(s)).lower() == "claims":
                claims_ind = 1
        if claims_ind == 1:
            return "claims"
        elif claim_ind == 1:
            return "claim"
        else:
            return "NOA"
    except Exception as e:
        print("error in presence of claim or claims function" + str(e))
def spell_correct(text):
    text = text.split(' ')
    c_text = []
    for word in text:
        if bool(re.match(r'\W', word)) is False:
            word = autocorrect.spell(word)
        c_text.append(word)
    c_text = ' '.join(c_text)
    return c_text
def spell_check(query):
    """ Takes an n-gram and fixes spelling """
    query_list = query.split()
    for index, q in enumerate(query_list):
        query_list[index] = spell(q.strip())
    suggestion = " ".join(query_list)
    return suggestion.strip()
def runSpellChecker(reviews):
    stemmer = PorterStemmer()
    index = 0
    for review in reviews:
        for i in range(len(review)):
            review[i] = stemmer.stem(spell(review[i]))
        reviews[index] = review
        index += 1
    return reviews
def spelltest(tests, verbose=False):
    # time.perf_counter() is used because time.clock() was removed in Python 3.8
    n, bad, unknown, start = 0, 0, 0, time.perf_counter()
    for target, incorrect_spellings in tests.items():
        for incorrect_spelling in incorrect_spellings.split():
            n += 1
            w = spell(incorrect_spelling)
            if w != target:
                bad += 1
                if not known([target]):
                    unknown += 1
                if verbose:
                    print(MSG.format(incorrect_spelling, w, NLP_COUNTS[w], target, NLP_COUNTS[target]))
    return RESULT.format(bad, n, int(100. - 100. * bad / n), unknown,
                         int(time.perf_counter() - start))
def sample(scores, topics_file='lifelog_qrels/lifelogging_topics_formal.xml', max=50):
    xmldoc = minidom.parse(topics_file)
    topic_nodes = xmldoc.getElementsByTagName('topic')
    queries = ""
    for node in topic_nodes:
        topic = {}
        for tag in node.childNodes:
            if tag.nodeType == tag.ELEMENT_NODE:
                name, value = tag.tagName, tag.childNodes[0].nodeValue
                if name == 'narrative':
                    queries += value
    query_terms = set(string_to_concepts(queries))
    topics = []
    concepts = {}
    terms = []
    with open(os.path.dirname(os.path.abspath(__file__)) + '/data/stopwords.txt') as f:
        stopwords = f.read().split('\n')
    for term in query_terms:
        if term not in stopwords:
            topics.append(term)
    for term in scores:
        if term not in stopwords and term in topics:
            concepts[term] = scores[term]
    sorted_concepts = sorted(concepts.items(), key=lambda x: x[1], reverse=True)
    sorted_terms = [x[0] for x in sorted_concepts]
    indices = gen_log_space(len(sorted_terms), max)
    for i in indices:
        terms.append(sorted_terms[i])
    for i in range(len(terms)):
        terms[i] = spell(terms[i]).lower()
    print(terms, len(terms))
def phraseSentence(msg):
    msg = msg.split(' ')
    msg_new = []
    for i in msg:
        msg_new.append(autocorrect.spell(i).lower())
    return msg_new
def spell_stem(txt):
    word_list = [stem(spell(word)) for word in txt]
    word_list = [word for word in word_list if word not in stops]
    return " ".join(word_list)