def test_capitalization(self):
    """Check that lookups, known/unknown sets and corrections ignore case."""
    checker = SpellChecker(language=None)
    # 'Bob' is added twice; presumably so it outranks 'Bab' when the
    # single best correction is requested at the end of the test.
    for entry in ('Bob', 'Bob', 'Bab'):
        checker.word_frequency.add(entry)

    # Every capitalisation of the stored word must be found.
    for variant in ('Bob', 'BOb', 'BOB', 'bob'):
        self.assertEqual(variant in checker, True)

    # Results are reported once, in lower case, whatever the input case.
    unknown_input = ['Bb', 'bb', 'BB']
    self.assertEqual(checker.unknown(unknown_input), {'bb'})
    known_input = ['BOB', 'bOb']
    self.assertEqual(checker.known(known_input), {'bob'})

    self.assertEqual(checker.candidates('BB'), {'bob', 'bab'})
    self.assertEqual(checker.correction('BB'), 'bob')
def clean_up_sentence(sentence):
    """Tokenize *sentence*, correct misspelled tokens and stem the result.

    Args:
        sentence: Raw input text.

    Returns:
        A list with one lower-cased, stemmed string per token produced by
        ``nltk.word_tokenize``.
    """
    spell = SpellChecker()
    tokens = nltk.word_tokenize(sentence)

    misspelled = spell.unknown(tokens)
    corrected = []
    for token in tokens:
        # unknown() reports words in lower case, so the comparison has to be
        # done on the lower-cased token or capitalised typos are never fixed.
        if token.lower() in misspelled:
            # correction() may return None when it has no suggestion; keep
            # the original token instead of crashing on None.lower() below.
            token = spell.correction(token) or token
        corrected.append(token)

    return [stemmer.stem(token.lower()) for token in corrected]
class SpellCheckDoc(BaseEstimator, TransformerMixin):
    """Transformer that strips punctuation and spell-corrects each document."""

    def __init__(self):
        # distance=1 restricts suggestions to a single edit.
        self.spell_ = SpellChecker(distance=1)

    def fit(self, X=None, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X=None, y=None):
        """Return a list with one corrected string per document in *X*.

        Punctuation is removed from each document before tokenizing.
        Tokens the checker does not flag are kept verbatim.
        """
        print("correcting spelling")
        strip_punctuation = str.maketrans('', '', string.punctuation)

        def _string_correction(doc):
            tokens = word_tokenize(doc)
            misspelled = self.spell_.unknown(tokens)
            # correction() may return None when it has no suggestion; fall
            # back to the original token so the join never sees None.
            return " ".join(
                (self.spell_.correction(token) or token)
                if token.lower() in misspelled else token
                for token in tokens
            )

        return [_string_correction(doc.translate(strip_punctuation))
                for doc in X]
def spellCorrectBackupBaseline(self, check_str):
    """Baseline spell checker built on the spellchecker library.

    Args:
        check_str: Whitespace-separated text to correct.

    Returns:
        The text with every purely alphabetic, unknown word replaced by its
        most likely correction; other words are left untouched.
    """
    print('spellCorrectBackupBaseline called')
    spell = SpellChecker()
    # known() only *queries* the dictionary and its result was discarded;
    # load_words() is what actually registers the domain terms.
    spell.word_frequency.load_words(['zwave', 'rheem'])

    words = check_str.split()
    for idx, word in enumerate(words):
        if not word.isalpha():
            continue
        unknown = spell.unknown([word.lower()])
        if len(unknown) == 1:
            candidate = next(iter(unknown))
            # correction() may return None; keep the original word then.
            words[idx] = spell.correction(candidate) or word
    return " ".join(words)
def spellchecker_test(list_tokens, token_tags):
    """Print each unknown token of *list_tokens* with its best correction.

    ``token_tags`` is accepted but not used by this function.

    Returns:
        0 on success, 405 when the checker raises ``TypeError``.
    """
    banner = '#' * 20
    print('\n{} \nBegin \'SpellChecker\' testing \n'.format(banner))
    try:
        checker = SpellChecker()
        for bad_word in checker.unknown(list_tokens):
            print(f'\nThe incorrect word is "{bad_word}"')
            suggestion = checker.correction(bad_word)
            print(f'Using Spellchecker, the correction is : {suggestion}')
    except TypeError as error:
        print(f'Invalid string : {error}')
        return 405
    return 0
def spellcheck(words):
    """Return the entries of *words* that the spell checker does not know.

    A fixed list of web-related terms is loaded into the dictionary first so
    that those terms are never reported.
    """
    allowed_terms = [
        '\n', '©', 'blog', 'website', 'monetization', 'php', 'analytics',
        'seo', 'wordpress', 'mysql', 'html5', 'css3', 'google', 'drupal',
        'facebook', 'youtube', 'linkedin'
    ]
    checker = SpellChecker(distance=1)
    checker.word_frequency.load_words(allowed_terms)
    # Only the unknown words themselves are returned, not suggestions.
    return list(checker.unknown(words))
def spelling_errors(dataset: list):
    """Compute the spelling error rate of every essay in *dataset*.

    Each sample is a dict with an ``'essay_token'`` entry (the essay's
    tokens). The sample is updated in place with ``'spelling_error_rate'``:
    the number of distinct unknown words (longer than two characters and not
    starting with ``'@'``) divided by the total token count.

    Returns:
        ``{'spelling_error_rate': {'mean': ..., 'std': ...}}`` over all
        samples.
    """
    spell_checker = SpellChecker()
    spell_error_list = []
    for data_sample in dataset:
        essay_words = data_sample['essay_token']
        # Length is tested first so an empty string never reaches word[0].
        error_words = [
            word for word in spell_checker.unknown(essay_words)
            if len(word) > 2 and word[0] != '@'
        ]
        # An essay without tokens has no errors; avoid ZeroDivisionError.
        rate = len(error_words) / len(essay_words) if essay_words else 0.0
        data_sample['spelling_error_rate'] = rate
        spell_error_list.append(rate)

    return {
        'spelling_error_rate': {
            'mean': np.mean(spell_error_list),
            'std': np.std(spell_error_list),
        }
    }
def spelling11(data1, text1):
    """Spell-correct the text column *text1* of DataFrame *data1* in place.

    The checker's vocabulary is extended with the words found in
    ``corporaForSpellCorrection.txt``. Every string cell is split on
    whitespace, unknown words are replaced by their most likely correction,
    and the words are re-joined with one leading space per word (the
    historical output format). Non-string cells are left untouched.

    Returns:
        The same DataFrame object, with the column replaced.
    """
    spell = SpellChecker()
    spell.word_frequency.load_text_file('corporaForSpellCorrection.txt')

    def _correct_cell(cell):
        if not isinstance(cell, str):
            # Missing / non-text values cannot be tokenized; keep them.
            return cell
        words = cell.split()
        misspelled = spell.unknown(words)
        # NOTE(review): unknown() returns lower-cased words, so typos that
        # contain capitals are not matched here; confirm this is intended.
        fixed = [
            (spell.correction(word) or word) if word in misspelled else word
            for word in words
        ]
        return ''.join(' ' + word for word in fixed)

    # Assign the whole column at once instead of chained
    # ``data1[text1].iloc[k] = ...``, which may write to a temporary copy.
    data1[text1] = data1[text1].apply(_correct_cell)
    return data1
def correct_spellings(text):
    """
    @Desc       : 'Check the text content for spelling errors'
    @Parameters : 'text' - 'text content'
    @Returns    : 'correct_text' - 'the processed text'
    @Time       : '2020/6/8 8:55'
    """
    spell = SpellChecker()
    words = text.split()
    misspelled_words = spell.unknown(words)
    # NOTE(review): unknown() returns lower-cased words, so typos containing
    # capitals are not matched by the membership test below.
    correct_text = [
        # correction() may return None when nothing is suggested; keep the
        # original word so the join below never receives None.
        (spell.correction(word) or word) if word in misspelled_words else word
        for word in words
    ]
    return " ".join(correct_text)
class newToolKit:
    """Spell-fixing helpers on top of a tokenizer object and SpellChecker.

    ``nltktools`` must provide ``tokenize_words(text)`` returning a sequence
    of tokens.
    """

    # Any character repeated three or more times in a row.
    _LENGTHENING = re.compile(r"(.)\1{2,}")

    def __init__(self, nltktools):
        print("newToolKit instance")
        self.mynlp = nltktools
        self.spell = SpellChecker()

    def reduce_lengthening(self, text: str) -> str:
        """Collapse runs of 3+ identical characters to exactly two."""
        return self._LENGTHENING.sub(r"\1\1", text)

    def type_1_WordAutoFix(self, word: str) -> str:
        """Return the best correction for *word* after de-lengthening it.

        Falls back to the de-lengthened word when the checker has no
        suggestion (correction() may return None).
        """
        word = self.reduce_lengthening(word)
        return self.spell.correction(word) or word

    def type_1_textAutoFix(self, text: str) -> str:
        """Auto-fix every token of *text*; each word is followed by a space."""
        tokens = self.mynlp.tokenize_words(text)
        return "".join(self.type_1_WordAutoFix(token) + " " for token in tokens)

    def type_1_textCandidates(self, text: str) -> list:
        """Return the correction candidates for the first token of *text*.

        An empty collection is returned when *text* has no tokens or the
        checker offers no candidates.
        """
        tokens = self.mynlp.tokenize_words(text)
        if not tokens:
            return set()
        return self.spell.candidates(tokens[0]) or set()

    def unknown(self, text):
        """Return the set of tokens of *text* unknown to the checker."""
        tokens = self.mynlp.tokenize_words(text)
        return self.spell.unknown(tokens)

    def fixemall(self, text: str):
        """Alias of :meth:`type_1_textAutoFix`.

        The previous body de-lengthened each token twice, which gives the
        same result as doing it once, so it simply delegates now.
        """
        return self.type_1_textAutoFix(text)
async def ia_corrige_palavras(msg):
    """Reply with a spell-corrected version of the message being replied to.

    Acts only in supergroups, when the incoming text starts with
    ``'corrigir'`` and is a reply to a text message. The first misspelled
    word of the replied-to text is replaced by its most likely correction;
    the bot then cycles through the other candidates, editing its own
    message every two seconds, and finally restores the best correction.

    Always returns True; failures are logged instead of raised.
    """
    import asyncio
    import logging

    try:
        chat_id = msg['chat']['id']
        chat_type = msg['chat']['type']
        if chat_type != 'supergroup' or not msg.get('text'):
            return True

        texto = msg['text']
        # reply_to_message is absent for non-reply messages.
        reply = msg.get('reply_to_message') or {}
        if 'text' not in reply or not texto.startswith('corrigir'):
            return True

        spell = SpellChecker(language='pt')
        mensagem = reply['text']
        misspelled = spell.unknown(mensagem.split())
        if not misspelled:
            return True

        # The word that was wrong in the sentence. Correction and candidates
        # are computed for this same word, so the replacement is consistent.
        palavra_errada = next(iter(misspelled))
        corrigir = spell.correction(palavra_errada) or palavra_errada
        candidatos = spell.candidates(palavra_errada) or ()

        # New sentence with the wrong word replaced by the most likely one.
        mensagem_corrigida = mensagem.replace(palavra_errada, corrigir)
        a = await bot.sendMessage(
            chat_id,
            f"@{msg['from']['username']} `aqui esta a frase corrigida, em 2 segundos irei mostrar outras alternativas caso existam:`\n***{mensagem_corrigida}***",
            'markdown')
        # asyncio.sleep yields to the event loop; time.sleep would block it.
        await asyncio.sleep(2)

        # Other correction alternatives.
        for candidato in candidatos:
            alternativas_corrigidas = mensagem.replace(
                palavra_errada, candidato)
            await bot.editMessageText(
                (chat_id, a['message_id']),
                f"`Algumas alternativas:`\n***{alternativas_corrigidas}***",
                'markdown')
            await asyncio.sleep(2)

        await bot.editMessageText(
            (chat_id, a['message_id']),
            f"`Correção:`\n***{mensagem_corrigida}***", 'markdown')
    except Exception:
        # Best effort: never let a failure here break message handling, but
        # leave a trace instead of swallowing the error silently.
        logging.getLogger(__name__).exception('ia_corrige_palavras failed')
    return True
def view_essay_submission(request, essay_submission_id):
    """Render an essay submission with spelling and word-frequency data.

    Anonymous users are redirected to the ``index`` route. For authenticated
    users the template receives the submission, a list of
    ``[word, correction, candidates]`` entries for every unknown word, the
    five most common non-stop-word tokens and the tokens occurring once.
    """
    if not request.user.is_authenticated:
        return HttpResponseRedirect(reverse_lazy('index'))

    essay_submission = EssaySubmission.objects.get(
        id=int(essay_submission_id))

    # Spelling report for every word the checker does not know.
    checker = SpellChecker()
    mispelled_list = [
        [word, checker.correction(word), checker.candidates(word)]
        for word in checker.unknown(essay_submission.content.split())
    ]

    # Word statistics, ignoring stop words and punctuation tokens.
    parsed = nlp(essay_submission.content)
    word_freq = Counter(
        token.text for token in parsed
        if not (token.is_stop or token.is_punct)
    )
    common_words = word_freq.most_common(5)
    unique_words = [word for word, freq in word_freq.items() if freq == 1]

    return render(request, "view_essay_submission.html", {
        'essay_submission': essay_submission,
        'mispelled_list': mispelled_list,
        'common_words': common_words,
        'unique_words': unique_words,
    })
def clusteredTopNouns(tag):
    """Cluster the top nouns for *tag* and keep one representative each.

    Steps:
      1. ``createTopNounDict(tag)`` provides a ``{noun: count}`` mapping.
      2. A misspelled noun whose correction is also in the mapping is merged
         into that correction (counts are added).
      3. The nouns are embedded with ``nlp(...).vector`` and clustered with
         KMeans, using roughly one cluster per three nouns.
      4. For every cluster the noun with the highest count is kept.

    Returns:
        ``(filteredFeatures, clsters)`` where ``filteredFeatures`` maps each
        representative noun to its count and ``clsters`` maps it to the
        tuple of all nouns in its cluster.
    """
    topNounWords = createTopNounDict(tag)
    spell = SpellChecker(distance=1)  # set at initialization

    misspelled = spell.unknown(list(topNounWords.keys()))
    for word in misspelled:
        corrected = spell.correction(word)
        # Merging a word into itself would double its count, so a correction
        # identical to the word is skipped, as is a missing one (None).
        if (corrected == word or corrected not in topNounWords
                or word not in topNounWords):
            continue
        topNounWords[corrected] += topNounWords.pop(word)

    if not topNounWords:
        return {}, {}

    wordvectors = {word: nlp(word).vector for word in topNounWords}

    # assumes 300-dimensional vectors from `nlp` — TODO confirm
    X = np.zeros((len(wordvectors), 300))
    for i, vector in enumerate(wordvectors.values()):
        X[i] = vector

    # At least one cluster, otherwise KMeans rejects n_clusters=0 when there
    # are fewer than three nouns.
    n_clusters = max(1, len(X) // 3)
    kmeans = cluster.KMeans(n_clusters=n_clusters, max_iter=1000)
    kmeans.fit(X)

    clusters = {}
    for word, label in zip(wordvectors, kmeans.labels_):
        clusters.setdefault(label, []).append((word, topNounWords[word]))

    filteredFeatures = {}
    clsters = {}
    for members in clusters.values():
        words = tuple(word for word, _ in members)
        # max() returns the first member with the highest count.
        top_word, top_count = max(members, key=lambda member: member[1])
        filteredFeatures[top_word] = top_count
        clsters[top_word] = words
    return filteredFeatures, clsters
def class_name_should_be_noun(self, identifier):
    """Report whether a class name violates the "should be a noun" rule.

    The correctly spelled parts of ``identifier.parts`` are POS-tagged with
    nltk and the parts tagged as nouns (``NN``, ``NNP``, ``NOUN``) counted.

    Returns:
        True when no noun was found (rule violated), False otherwise.
    """
    spell = SpellChecker()
    misspelled_parts = spell.unknown(identifier.parts)
    identifier_parts_correct = [
        part for part in identifier.parts if part not in misspelled_parts
    ]

    pos_tags_result = nltk.pos_tag(identifier_parts_correct)
    print(pos_tags_result)

    number_of_names_in_class_name = 0
    for pair in pos_tags_result:
        if pair[1] in ('NN', 'NNP', 'NOUN'):
            # NOTE(review): `id` is not defined in this method, so unless the
            # module defines it this prints the builtin function — confirm
            # which identifier was meant.
            # The message used to say "Verb" although a noun tag matched.
            print(str(id), '\tNoun is found in {}'.format(pair))
            number_of_names_in_class_name += 1

    if number_of_names_in_class_name == 0:
        print(str(id), '\t class name should be noun')
        return True
    return False
def spell_check(doc, raw_terms):
    """Record a MISSPELLING match for every unknown token of *doc*.

    ``raw_terms`` are loaded into the checker's dictionary first. For each
    token the checker does not know (tokens starting with an apostrophe are
    ignored) the module-level counters ``all_matches`` and ``total_matches``
    are incremented; a span is appended to ``match_ents`` unless it overlaps
    an entry of ``match_ents``, ``definitions`` or ``quantity_spans``.
    """
    global all_matches
    checker = SpellChecker()
    checker.word_frequency.load_words(raw_terms)
    protected_spans = (match_ents, definitions, quantity_spans)

    for token in doc:
        text = token.text
        if not checker.unknown([text]) or text.startswith("'"):
            continue

        all_matches += 1
        total_matches["MISSPELLING"] = total_matches.get("MISSPELLING", 0) + 1

        start = token.idx
        end = start + len(token.text)
        # Checked in the same order as before, stopping at the first overlap.
        if any(overlap(spans, start, end) for spans in protected_spans):
            continue
        match_ents.append({
            "start": start,
            "end": end,
            "label": "MISSPELLING",
        })
def check(words):
    """Return *words* with misspelled tokens replaced by their correction.

    The text is split with ``TweetTokenizer``. Unknown tokens are replaced
    by the checker's best suggestion, known single-character tokens become
    empty strings, and everything else is kept. The pieces are joined with
    single spaces.
    """
    spell = SpellChecker()
    tokens = TweetTokenizer().tokenize(words)

    # NOTE(review): unknown() returns lower-cased words, so typos containing
    # capitals are not matched by the membership test below.
    misspelled = spell.unknown(tokens)

    corrected = []
    for token in tokens:
        if token in misspelled:
            # correction() may return None; keep the token in that case so
            # the join below does not fail.
            corrected.append(spell.correction(token) or token)
        elif len(token) == 1:
            # Historical behaviour: single-character tokens are blanked.
            corrected.append("")
        else:
            corrected.append(token)
    return " ".join(corrected)
class SpellCheck:
    """Sentence-level spell correction / filtering with a lazy checker."""

    def __init__(self):
        # Created on first use so importing this module stays cheap.
        self.spell = None

    def _get_spell(self):
        """Return the shared SpellChecker, creating it on first use."""
        if self.spell is None:
            from spellchecker import SpellChecker
            self.spell = SpellChecker()
        return self.spell

    def spell_correct(self, x):
        """
        Given a sentence x, check for each word whether it is misspelled
        and replace misspelled words by their most likely correction.

        :param x: the sentence to correct
        :return: the corrected sentence, tokens joined by single spaces
        """
        spell = self._get_spell()
        word_list = word_tokenize(x)
        # correction() may return None; fall back to the word itself.
        corrections = {
            word: spell.correction(word) or word
            for word in spell.unknown(word_list)
        }
        # NOTE(review): the keys are lower-cased by unknown(), so typos that
        # contain capitals are not replaced here.
        return ' '.join(corrections.get(word, word) for word in word_list)

    def spell_check(self, x):
        """
        Given a sentence x, return the same sentence with all misspelled
        words removed. This function does not correct misspelled words, it
        only filters them out; use spell_correct to correct them.

        :param x: the sentence to filter
        :return: the known tokens of x, joined by single spaces
        """
        spell = self._get_spell()
        word_list = word_tokenize(x)
        known_words = spell.known(word_list)
        # known() returns lower-cased words; compare case-insensitively so
        # capitalised known words are not dropped.
        return ' '.join(
            word for word in word_list if word.lower() in known_words)
def allPermutations(str):
    """Print dictionary words that are permutations of the letters in *str*.

    Every permutation the checker does not know is corrected; a correction
    is kept when it is itself one of the permutations. Matches are printed
    as they are found, then the de-duplicated list is printed at the end.
    """
    spell = SpellChecker()

    # A set gives O(1) membership tests and removes the duplicates that
    # repeated letters produce.
    jumbledwords = {''.join(perm) for perm in permutations(str)}

    misspelled = spell.unknown(jumbledwords)
    wellspelled = []
    print("Please wait while we compute and show the final suggestions")
    for word in misspelled:
        # Get the one `most likely` answer; None means no suggestion.
        correction = spell.correction(word)
        if correction is None:
            continue
        # Same length as the search key and one of its permutations.
        if len(word) == len(correction) and correction in jumbledwords:
            # Temporary print to keep the user informed.
            print(correction)
            if correction not in wellspelled:
                wellspelled.append(correction)

    # Print all suggestions.
    print("\n\nFinal Suggestions!")
    for item in wellspelled:
        print(item)
class spaCySpellChecker(object):
    """spaCy pipeline component that rewrites a Doc with corrected words.

    ``custom_dictionary`` maps token texts to replacements that are applied
    before the Spanish spell checker is consulted.
    """

    def __init__(self, nlp, custom_dictionary):
        # set_extension raises if the attribute is already registered, which
        # happens as soon as the component is instantiated a second time.
        if not Token.has_extension('spellchecker_unknown'):
            Token.set_extension('spellchecker_unknown', default=None)
        self.checker = SpellChecker(language='es')
        self.custom_dictionary = custom_dictionary

    def __call__(self, doc):
        """Return a new Doc whose words are the corrected tokens of *doc*."""
        correct_words = []
        for token in doc:
            text = self.custom_dictionary.get(token.text, token.text)
            try:
                misspelled = self.checker.unknown([text])
                if misspelled:
                    word = next(iter(misspelled))
                    # correction() may return None; keep the token text then
                    # so Doc never receives a None word.
                    correct_words.append(
                        self.checker.correction(word) or text)
                else:
                    correct_words.append(text)
            except UnicodeEncodeError:
                # Historical behaviour: a token that cannot be encoded is
                # left out of the resulting Doc.
                continue
        return Doc(doc.vocab, words=correct_words)
def spell_check(sent, language):
    """Describe the spelling errors of *sent* as three report strings.

    Returns:
        A tuple of three strings: the set of unknown words, a mapping from
        each unknown word to its most likely correction, and a mapping from
        each unknown word to all of its candidates.
    """
    checker = SpellChecker(language=language)
    misspelled = checker.unknown(sent.split())

    most_likely = {word: checker.correction(word) for word in misspelled}
    candidats = {word: checker.candidates(word) for word in misspelled}

    return (
        "The errors in the sentence are : {} ".format(misspelled),
        "The most likely answer : {}".format(most_likely),
        "The other likely options : {}".format(candidats),
    )
def spell_check():
    """Spell-check identifiers read from stdin and print sed-style fixes.

    For every input line, ASCII-only words longer than four characters that
    are not yet in the module-level ``known`` set (and do not start with a
    digit or apostrophe) are split into camelCase / snake_case parts. Each
    unknown part is printed as ``s/<part>/<correction>``. Words seen on a
    line are added to ``known`` so they are reported only once.
    """
    spell = SpellChecker()
    spell.word_frequency.load_words(known)

    token_re = re.compile(r"[\w']+")
    part_re = re.compile(
        r"(?<=[a-z0-9])(?=[A-Z])|_|(?<=[A-Z])(?=[A-Z][a-z])")

    def _is_candidate(w):
        return (w.lower() not in known
                and w[0] not in "'0123456789"
                and len(w) > 4
                and all(ord(c) < 128 for c in w))

    for num, line in enumerate(sys.stdin.readlines(), start=1):
        words = [w for w in token_re.findall(line) if _is_candidate(w)]
        known.update(w.lower() for w in words)

        # Progress indicator, overwritten in place.
        print('#', num, end='\r')
        time.sleep(0.002)

        for identifier in words:
            for word in spell.unknown(part_re.split(identifier)):
                # Get the one `most likely` answer.
                print('#', num, identifier, '\t',
                      "s/%s/%s" % (word, spell.correction(word)))
def spellcheck(input_file):
    """Spell-check the words of *input_file* and write a report next to it.

    The input holds one word per line. The report is written to
    ``<input without extension>_output.txt`` with one numbered line per
    unknown word. Module-level flags control its content:
    ``show_only_words``, ``multiple_recommendations``,
    ``max_recommendations`` and ``only_show_words_with_recommendations``.
    """
    import os

    # Read word set from input_file.
    with open(input_file, "r") as file:
        word_list = file.read().split("\n")

    spell = SpellChecker()
    misspelled = spell.unknown(word_list)

    index = 0
    changes = []
    for word in misspelled:
        if word == "":
            continue
        corrected = ""
        entry = str(index) + ". " + word
        if not show_only_words:
            # correction() may return None when nothing is suggested; treat
            # that as "no change" so string concatenation cannot fail.
            corrected = spell.correction(word) or word
            entry += " --> " + corrected
            # Fetch a list of other potential spell options (optional).
            if multiple_recommendations:
                options = list(spell.candidates(word) or ())
                entry += "; " + str(options[:max_recommendations])
        if not only_show_words_with_recommendations or word != corrected:
            changes.append(entry)
            index += 1

    # splitext only strips the final extension, so dots in directory names
    # (e.g. "./data/words.txt") no longer truncate the output path.
    output_file = os.path.splitext(input_file)[0] + "_output" + ".txt"
    with open(output_file, 'w') as writer:
        writer.writelines(change + "\n" for change in changes)
def main():
    """Normalise the class names in CLASSES_PATH and save the mapping.

    Every original class name is mapped to its lower-cased form with digits
    removed; names the spell checker does not know are then replaced by
    their most likely correction. The resulting mapping is written to
    ``cleaned_classes.json``.
    """
    spell = SpellChecker()
    with open(CLASSES_PATH, 'r') as f:
        class_map = json.load(f)
    print('Number of initial classes: ', len(class_map.keys()))

    # Remove numbers.
    for word in class_map:
        class_map[word] = ''.join(
            ch for ch in word.lower() if not ch.isdigit())
    print(
        len(set(class_map.values())),
        ' number of classes remaining after removing special chars and numbers.'
    )

    # Find those words that may be misspelled.
    misspelled = spell.unknown(list(class_map.values()))
    corrections = {}  # cache: many class names share the same cleaned word
    for i, word in enumerate(list(class_map.keys())):
        cleaned_word = class_map[word]
        if cleaned_word in misspelled:
            if cleaned_word not in corrections:
                # correction() may return None; keep the cleaned word then
                # instead of writing null into the mapping.
                corrections[cleaned_word] = (
                    spell.correction(cleaned_word) or cleaned_word)
            class_map[word] = corrections[cleaned_word]
        if i % 500 == 0 and i != 0:
            print(i)

    with open('cleaned_classes.json', 'w') as f:
        json.dump(class_map, f, indent=4)
    print(
        len(set(class_map.values())),
        ' number of classes remaining after removing spelling errors according to SpellChecker.'
    )
def error_grammar_frequency_en(text):
    """Return the share of misspelled tokens in *text*, as a percentage.

    :param text: the full text of the e-mail
    :return: number of distinct unknown words divided by the number of
        tokens, times 100; 50.0 when the text has no tokens
    """
    spell = SpellChecker()
    # First tokenize by sentence, then by word, to ensure that punctuation
    # is caught as its own token.
    tokens = [
        word
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    size_txt = len(tokens)
    if size_txt == 0:
        # Historical fallback for empty input (frequency 0.5), now an
        # explicit guard instead of a bare ``except`` around the division.
        return 0.5 * 100

    misspelled = spell.unknown(tokens)
    # Transform into percent.
    return len(misspelled) / size_txt * 100
def preprocess_reviews(reviews):
    """Clean every review of *reviews* in place.

    For each review: non-alphanumeric characters and digits are replaced by
    spaces, isolated single letters are removed, whitespace is collapsed and
    the text is lower-cased. Project-specific identifiers are then replaced
    by ``"code"`` and student names/ids by ``"student"``. For any other word
    the checker does not know, the suggested correction is printed (it is
    not applied to the text).

    The function mutates the list and returns None.
    """
    # Sets give O(1) membership tests for the per-word lookups below.
    code = frozenset((
        "expertiza", "travser", "questionnare", "realestatemodel",
        "getrubricarray", "questarray", "curquestionnaire", "iquiry",
        "questionnarecontroller", "vmquestionresponse", "capybaraerror",
        "interfce", "reportstrategy", "visiualisation", "potentialbuyer",
        "sepc", "nilclass", "supplementaryreviewquestionnaire", "pannernode",
        "realestatecompany", "codeclimate", "househunter", "offscreencanvas",
        "herokuapp", "webaudio", "nomethoderror", "constantsourcenode",
        "quesparams", "assignmentparticipant", "bluetoothdiscoverysession",
        "webidl", "offscreenrenderingcontext",
        "offscreencanvasrenderingcontext", "oscillatornode", "rpec",
        "gemfile", "rubocop", "requesteduser", "travisci", "lowfi", "bacui",
        "importfilecontroller", "metareview", "oodd", "assignmentteam",
        "selfreview", "amazonaws", "agenttourlist", "signuptopic",
        "functionalites", "rubymine", "setvaluecurveattime",
        "gradinghistory", "simplecov", "bluetoothadapter", "issuecomment",
        "tourscontroller", "metareviews", "umls", "webbluetooth",
        "scenairos", "getrole", "rspecs", "agenttest",
        "potentialbuyercontroller", "customeroptions", "standarderror",
        "nameurl", "assignmentcontroller", "addcreatetopicsection",
        "createconstantsource", "baseaudio", "reviewbid",
        "reviewbidcontroller", "rscore", "assignmentscontroller",
        "adminpanel", "testfolder", "potentialbuyers", "memebershave",
        "stereocontext", "droptopicdeadline", "getphoto",
        "applicationcontroller", "searchcontroller", "responsecontroller",
        "generatereport", "hasallprivilegesof",
        "htmlcanvascontextrendering", "webglrenderingcontext",
        "reviewresponsemap", "timezonepref", "gradescontroller", "gemlock",
        "appendcreatetopicsection", "offscreencanvascontextrendering",
        "deletedtour", "mdfile", "getlocal", "parsererror", "reviewmapping",
        "oodesign", "ooddprogram", "coverrange", "usinggit", "gitgit",
        "herokuwould", "navebar", "numericality", "repositoryhttps",
        "edgecases", "expertzia", "metareviewer", "sqlexception",
        "experitza", "gdrive", "assignmentquestionnaire",
        "questionnairecontroller", "setcurrentvalueattime",
        "uncaughtthrowerror", "customerbookingscontroller", "runningbundle",
        "createhouseinformation", "exertiza", "staticpagescontroller",
        "createtopic", "addtopic", "tourmanagement", "functionalties",
        "devcenter", "googleuser", "applicationrecord", "factorybot",
        "ereadme", "inquirydetails", "funtionalities", "existassignment",
        "modelsand", "baseaudiocontext", "constantsourceoptions",
        "foreignmodel", "bookmarkratingresponsemap",
        "bookmarkratingquestionnaire", "degin", "audioparam",
        "signupsheetcontroller", "screencase", "participantsuper",
        "setposition", "setorientation", "waveshapernode",
        "biquadfilternode", "betahttps", "gitusing", "isrealtor",
        "ishousehunter", "addquestionnairetablerow", "popupcontroller",
        "hasmany", "hasone", "debugbranch", "userscontroller", "userr",
        "heatgrid", "architcture", "flowchats", "interfact",
    ))
    student = frozenset((
        "kunalnarangtheone", "ychen", "stereopannernode", "swivl",
        "ibwsfrvjmiytql", "slwhv", "iucqq", "sidekiq", "yzhu",
        "nilaykapadia", "jasminewang", "bebacc", "skannan", "rustfmt",
        "ocde", "drupadhy", "ajain", "amody", "upadhyaydevang", "henlo",
        "txmwju", "kqbvycku", "bdxxa", "rxsun", "bmyvjy", "rommsw",
        "travisbuddy", "hhharden", "appveyor", "rahulsethi", "rshakya",
        "ziwiwww", "nikitaparanjape", "hounse", "tourid", "probablty",
        "myaccounts", "nainly", "flazzle", "folls", "dhamang", "dfef",
        "afbc", "eqsy", "impliescode", "jwarren", "dodn", "ferjm", "jisx",
        "coulhasdn", "cbbdf", "partipant", "jwboykin", "amogh", "agnihotri",
        "fdea", "rbit", "rbdoes", "pronciple", "sbasnet", "kvtmnznc",
        "ppvasude", "ceec", "edabe", "namig", "pptn", "explainationit",
        "urswyvyc",
    ))
    spell = SpellChecker()

    for i in range(len(reviews)):
        text = reviews[i]
        text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # Removing special characters
        text = re.sub('\'', ' ', text)  # Removing quotes
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        text = re.sub(r'\d', ' ', text)  # Replacing digits by space
        # Removing single characters and the spaces alongside.
        text = re.sub(r'\s+[a-z][\s$]', ' ', text)
        # Replacing more than one space with a single space.
        text = re.sub(r'\s+', ' ', text)
        # NOTE(review): '.' and ':' were already replaced by spaces above, so
        # this URL branch can never trigger; kept unchanged pending a
        # decision on the intended order of the steps.
        if ('www.' in text or 'http:' in text or 'https:' in text
                or '.com' in text):
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
        text = text.lower()

        for word in text.split():
            if word in code:
                text = text.replace(word, "code")
            elif word in student:
                text = text.replace(word, "student")
            elif spell.unknown([word]):
                # The suggestion is only reported, not applied.
                recommended = spell.correction(word)
                print(recommended)
        reviews[i] = text
def translate():
    """Read a Spanish word from stdin, suggest spellings and translate it.

    When the word is unknown to the Spanish spell checker, the candidate
    spellings are printed. The word is then sent to the translator
    (Spanish to English).
    """
    translator = Translator()
    spell = SpellChecker(language=u'es')
    print("Enter word to translate. Enter ! when you are done.")

    word = str(input()).lower()
    # Find out whether the word may be misspelled.
    if spell.unknown([word]):
        # candidates() may return None when there is nothing to suggest.
        candidates = spell.candidates(word) or ()
        print("The word was not found. Suggestions: " + ", ".join(candidates))

    # NOTE(review): the translations are fetched but not used or returned
    # within this function.
    trans = translator.translate(
        word, dest='en', src='es').extra_data['all-translations']
def spell_checker(text):
    """Return a JSON report of the spelling problems found in *text*.

    The JSON object has three entries: ``suggestions`` (the candidates for
    each unknown word), ``misspelled`` (the unknown words) and
    ``likely_correct`` (the best correction for each unknown word).
    Values json cannot encode natively are passed to ``set_default``.
    """
    checker = SpellChecker()
    # Find those words that may be misspelled.
    misspelled = checker.unknown(text.split())

    likely_correct = []
    suggestions = []
    for unknown_word in misspelled:
        likely_correct.append(checker.correction(unknown_word))
        suggestions.append(checker.candidates(unknown_word))

    report = {
        'suggestions': suggestions,
        'misspelled': misspelled,
        'likely_correct': likely_correct,
    }
    return json.dumps(report, default=set_default)
def check_spelling(
    input_text_or_list: Union[str, List[str]],
    lang='en',
    ignore_word_file_path: Union[str, Path] = _IGNORE_SPELLCHECK_WORD_FILE_PATH
) -> str:
    """Check and correct the spelling of a text or a list of tokens.

    Args:
        input_text_or_list: A string (tokenized with ``word_tokenize``) or a
            list of tokens; empty / None entries of a list are skipped.
        lang: Language of the spell checker dictionary.
        ignore_word_file_path: Text file whose words are added to the
            dictionary so they are never auto-corrected.

    Returns:
        The lower-cased tokens joined by single spaces, with every unknown
        token replaced by its most likely correction; ``''`` for empty
        input.
    """
    if input_text_or_list is None or len(input_text_or_list) == 0:
        return ''

    spelling_checker = SpellChecker(language=lang, distance=1)
    # TODO: add acronyms into spell checker to ignore auto correction
    # specified by _IGNORE_SPELLCHECK_WORD_FILE_PATH
    spelling_checker.word_frequency.load_text_file(ignore_word_file_path)

    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list.lower())
    else:
        tokens = [token.lower() for token in input_text_or_list if token]

    # One correction per distinct unknown word, applied to *every*
    # occurrence; correction() may return None, in which case the word is
    # kept as is.
    corrections = {
        word: spelling_checker.correction(word) or word
        for word in spelling_checker.unknown(tokens)
    }
    return ' '.join(corrections.get(token, token) for token in tokens).strip()
def spellcheck():
    """Spell-check an uploaded text file and render the result page.

    Users that are not logged in are redirected to the login page. On POST
    with a ``wrongText`` file upload, the unknown words of the file are
    listed in ``webresult``; otherwise a placeholder message is rendered.
    """
    if not session.get('logged_in'):
        return redirect(url_for('login'))

    if request.method != 'POST':
        return render_template('spellcheck.html', webresult="None So Far!")

    # .get() returns None when the field is missing, which replaces the
    # former bare ``except`` around the lookup.
    file = request.files.get('wrongText')
    if not file:
        return render_template('spellcheck.html', webresult="No Typos here")

    filename = secure_filename(file.filename)
    logger.info('User spell checked: "' + filename + '"')

    # Read the file, decode from bytes to string, split into words.
    words = file.read().decode("utf-8").split()
    spell = SpellChecker()
    mispelled = spell.unknown(words)

    if mispelled:
        # Output format kept as before: a leading space, then ", <word>"
        # for every unknown word.
        result = " " + "".join(", " + w for w in mispelled)
    else:
        result = "No Typos here!"
    return render_template('spellcheck.html', webresult=result)
class NLPUtils:
    """Small collection of whitespace-token based text clean-up helpers."""

    def __init__(self, language):
        # NOTE(review): `language` is accepted but not used; the English
        # stop-word list is always loaded — confirm whether that is intended.
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        # English stop words.
        self.STOPWORDS = set(stopwords.words('english'))

    def remove_stopwords(self, text):
        """Return *text* without the words contained in ``STOPWORDS``."""
        return " ".join(
            word for word in str(text).split() if word not in self.STOPWORDS)

    def stem_words(self, text):
        """Return *text* with every word replaced by its stem."""
        return " ".join(self.stemmer.stem(word) for word in text.split())

    def lemmatize_words(self, text):
        """Return *text* with every word replaced by its lemma."""
        return " ".join(
            self.lemmatizer.lemmatize(word) for word in text.split())

    def correct_spellings(self, text):
        """Return *text* with unknown words replaced by their correction.

        A word is kept unchanged when the checker has no suggestion for it
        (correction() may return None).
        """
        words = text.split()
        misspelled_words = self.spell.unknown(words)
        # NOTE(review): unknown() returns lower-cased words, so typos
        # containing capitals are not matched by the membership test.
        return " ".join(
            (self.spell.correction(word) or word)
            if word in misspelled_words else word
            for word in words
        )
class Incubator:
    """Genetic-algorithm driver that searches for a letter-substitution key.

    Instance attributes:
        sample_block_table: dict mapping each block listed in the samples
            file to its frequency.
        spellchecker: a pyspellchecker instance with the words from
            ``words_path`` added.
        population: total number of chromosomes alive at one time
            (elites + children + randoms).
        elites: how many elites are carried over each generation.
        children: how many children are created each generation.
        randoms: how many random chromosomes are added each generation.
        tournament_size: how many chromosomes are drawn per tournament.
        cross_chance: chance of crossing chromosomes when creating a child.
        mutation_chance: chance of mutating a chromosome when creating a
            child. cross_chance + mutation_chance should equal one.
        shock_enabled: True if genetic shock is enabled.
        shock_threshold: number of cycles of fitness stagnation before
            genetic shock is triggered.
        max_cycles: cycle number at which the simulation terminates.
    """

    def __init__(self, sample_path, words_path, elites, children, randoms,
                 tournament_size, cross_chance, mutation_chance, shock_value,
                 max_cycles):
        """Load training data and store the run parameters.

        Parameters:
            sample_path: path to the samples file ("<block> <count>" per line).
            words_path: path to a text file of extra words the spell checker
                should treat as valid.
            elites, children, randoms: per-generation counts (see class doc).
            tournament_size: number of chromosomes drawn per tournament.
            cross_chance, mutation_chance: breeding probabilities; their sum
                must be one.
            shock_value: <= 0 disables genetic shock; otherwise shock is
                enabled and shock_threshold is set to this value.
            max_cycles: cycle number at which the simulation terminates.

        Raises:
            AssertionError: if cross_chance + mutation_chance is not one.
        """
        # Initializes sample_block_table
        self.sample_block_table = self.getSampleBlockTable(sample_path)

        # Initializes spellchecker
        self.spellchecker = SpellChecker()
        self.spellchecker.word_frequency.load_text_file(words_path)

        # The chances are floats, so compare with a tolerance instead of
        # exact equality to avoid rejecting sums that differ only by rounding.
        assert math.isclose(cross_chance + mutation_chance, 1)

        # Loads all incubator parameters
        self.elites = elites
        self.children = children
        self.randoms = randoms
        self.population = self.elites + self.children + self.randoms
        self.tournament_size = tournament_size
        self.cross_chance = cross_chance
        self.mutation_chance = mutation_chance

        # Handles shock_value
        if shock_value <= 0:
            self.shock_enabled = False
            self.shock_threshold = 0
        else:
            self.shock_enabled = True
            self.shock_threshold = shock_value

        self.max_cycles = max_cycles

        # Prints incubator summary if verbose is enabled
        if __VERBOSE__:
            print("Incubator Summary:")
            print("sample_path: " + sample_path + " words_path: " + words_path)
            print("Total population: " + str(self.population))
            print("Elites: " + str(self.elites) + " Children: " + str(self.children) + " Randoms: " + str(self.randoms))
            print("Tournament size: " + str(self.tournament_size) + " Cross chance: " + str(self.cross_chance) + " Mutation chance: " + str(self.mutation_chance))
            print("Shock enabled: " + str(self.shock_enabled) + " Shock threshold: " + str(self.shock_threshold))
            print("Max cycles: " + str(self.max_cycles))
            print("\n")

    # ------------------------- TRAINING FUNCTIONS -------------------------

    def train(self, cipher_text):
        """Evolve chromosomes until one decrypts ``cipher_text``.

        Returns a ``(chromosome, cycles)`` tuple: either the first chromosome
        whose converted text contains no words unknown to the spell checker,
        or the fittest chromosome once ``max_cycles`` is reached.
        """
        cycles = 0

        # Generates pool of chromosomes
        chromosomes = [self.getRandomChromosome() for _ in range(self.population)]

        # Genetic shock trigger variables: shock fires when fitness has been
        # stagnant for shock_threshold cycles.
        best_fitness = 0
        shock_ticker = 0

        start_time = time.time()
        while True:
            cycles += 1

            # Checks all chromosomes to see if the correct one has been found
            for chromosome in chromosomes:
                decrypted = chromosome.convertText(cipher_text)
                if len(self.spellchecker.unknown(decrypted.split(" "))) == 0:
                    if __VERBOSE__:
                        print("Found key! " + str(chromosome))
                        print("Decrypted text: " + decrypted)
                        print("")
                    return (chromosome, cycles)

            # List of (chromosome, fitness) tuples, fittest first.
            chromosome_fitness = [
                (chromosome, self.getFitness(chromosome, cipher_text))
                for chromosome in chromosomes
            ]
            # Ascending sort followed by reverse (rather than reverse=True)
            # is kept deliberately: it determines the order of equal-fitness
            # entries and therefore which ones become elites.
            chromosome_fitness.sort(key=lambda x: x[1])
            chromosome_fitness.reverse()

            fittest, top_fitness = chromosome_fitness[0]

            # Checks if max_cycles exceeded. If so, returns the fittest chromosome
            if cycles >= self.max_cycles:
                print("Best Key: " + str(fittest))
                print("Decrypted text: " + fittest.convertText(cipher_text))
                print("")
                return (fittest, cycles)

            # Checks if fitness is stagnant
            if top_fitness <= best_fitness:
                shock_ticker += 1
            else:
                best_fitness = top_fitness
                shock_ticker = 0

            # If __VERBOSE__, provide report on most fit chromosome
            if __VERBOSE__:
                converted_text = fittest.convertText(cipher_text)
                converted_words = converted_text.split(" ")
                print("Cycle# " + str(cycles))
                print("Best Chromosome: " + str(fittest))
                print("Fitness: " + str(top_fitness))
                print("Shock Ticker: " + str(shock_ticker))
                print("Cycle Time: " + str(time.time()-start_time))
                print("Attempted Decrypt: " + converted_text)
                print("Known words: " + str(self.spellchecker.known(converted_words)))
                print("Unknown words: " + str(self.spellchecker.unknown(converted_words)))
                print("")
            start_time = time.time()

            # Creates a new chromosomes list, starting with clones of the elites
            new_chromosomes = [
                chromosome_fitness[elite_index][0].clone()
                for elite_index in range(self.elites)
            ]

            # Performs tournament process to select breeding candidates
            tournament_selections = []
            while len(tournament_selections) < self.children:
                tournament_selections.append(self.tournament(chromosome_fitness))

            # Breeds selected candidates
            while len(tournament_selections) > 0:
                chance = random.random()
                if chance < self.cross_chance and len(tournament_selections) > 1:
                    chromosome_one = tournament_selections.pop()
                    chromosome_two = tournament_selections.pop()
                    crossed_chromosomes = self.crossChromosomes(chromosome_one, chromosome_two)
                    new_chromosomes.append(crossed_chromosomes[0])
                    new_chromosomes.append(crossed_chromosomes[1])
                elif chance < (self.mutation_chance + self.cross_chance):
                    new_chromosomes.append(self.mutateChromosome(tournament_selections.pop()))
                else:
                    new_chromosomes.append(self.getRandomChromosome())

            # Adds random chromosomes to new_chromosomes
            for _ in range(self.randoms):
                new_chromosomes.append(self.getRandomChromosome())

            # Checks if genetic shock should be triggered
            if shock_ticker >= self.shock_threshold and self.shock_enabled:
                if __VERBOSE__:
                    print("Triggering genetic shock...\n")

                # Genetic shock: chromosomes whose fitness exceeds 90% of the
                # best fitness are replaced by random ones; all others are mutated.
                for index, candidate in enumerate(new_chromosomes):
                    if self.getFitness(candidate, cipher_text) > .9 * best_fitness:
                        new_chromosomes[index] = self.getRandomChromosome()
                    else:
                        new_chromosomes[index] = self.mutateChromosome(candidate)

                # Resets shock tickers and trackers
                shock_ticker = 0
                best_fitness = 0

            # Shifts new_chromosomes into gene pool
            chromosomes = new_chromosomes

    def mutateChromosome(self, chromosome):
        """Return a clone of ``chromosome`` with two mappings' targets swapped."""
        new_chromosome = chromosome.clone()

        # Chooses two distinct mapping indices to swap
        mutation_one_index = random.randint(0, 25)
        mutation_two_index = random.randint(0, 25)
        while mutation_two_index == mutation_one_index:
            mutation_two_index = random.randint(0, 25)

        mutation_one = new_chromosome.mappings[mutation_one_index]
        mutation_two = new_chromosome.mappings[mutation_two_index]

        new_chromosome.removeMapping(mutation_one)
        new_chromosome.removeMapping(mutation_two)

        # Each origin keeps its place but takes the other mapping's target.
        new_chromosome.addMapping((mutation_one[0], mutation_two[1]))
        new_chromosome.addMapping((mutation_two[0], mutation_one[1]))

        return new_chromosome

    def crossChromosomes(self, chromosome_one, chromosome_two):
        """Cross two chromosomes.

        Returns ``(new_chromosome_one, new_chromosome_two)``; the inputs are
        cloned, not modified.
        """
        new_chromosome_one = chromosome_one.clone()
        new_chromosome_two = chromosome_two.clone()

        for chromosome_iter in range(26):
            # Each position is exchanged with probability one half.
            if random.random() > .5:
                old_mapping_one = new_chromosome_one.mappings[chromosome_iter]
                old_mapping_two = new_chromosome_two.mappings[chromosome_iter]
                if old_mapping_one != old_mapping_two:
                    # The mapping in each chromosome that already uses the
                    # incoming target must be re-pointed so targets stay unique
                    # (presumably getMappingTarget looks a mapping up by its
                    # target letter -- confirm against Chromosome).
                    complement_mapping_one = new_chromosome_one.getMappingTarget(old_mapping_two[1])
                    complement_mapping_two = new_chromosome_two.getMappingTarget(old_mapping_one[1])
                    old_origin_one = complement_mapping_one[0]
                    old_origin_two = complement_mapping_two[0]

                    new_chromosome_one.removeMapping(complement_mapping_one)
                    new_chromosome_two.removeMapping(complement_mapping_two)
                    new_chromosome_one.removeMapping(old_mapping_one)
                    new_chromosome_two.removeMapping(old_mapping_two)

                    complement_mapping_one = (old_origin_two, complement_mapping_one[1])
                    complement_mapping_two = (old_origin_one, complement_mapping_two[1])

                    new_chromosome_one.addMapping(old_mapping_two)
                    new_chromosome_one.addMapping(complement_mapping_two)
                    new_chromosome_two.addMapping(old_mapping_one)
                    new_chromosome_two.addMapping(complement_mapping_one)

        return (new_chromosome_one, new_chromosome_two)

    def getRandomChromosome(self):
        """Return a new chromosome mapping 'a'..'z' onto a random shuffle of 'a'..'z'."""
        new_chromosome = Chromosome()
        origin = [chr(letter_index + 97) for letter_index in range(26)]
        destination = list(origin)
        random.shuffle(destination)
        for source_letter, target_letter in zip(origin, destination):
            new_chromosome.addMapping((source_letter, target_letter))
        return new_chromosome

    def tournament(self, chromosome_fitness):
        """Draw ``tournament_size`` random entries (with replacement) from
        ``chromosome_fitness`` and return a clone of the fittest chromosome drawn."""
        # Index by the actual list length so a draw can never go out of range;
        # this equals self.population - 1 whenever the pool has its nominal size.
        last_index = len(chromosome_fitness) - 1
        tournament_pool = [
            chromosome_fitness[random.randint(0, last_index)]
            for _ in range(self.tournament_size)
        ]
        return max(tournament_pool, key=lambda x: x[1])[0].clone()

    def getFitness(self, chromosome, cipher_text):
        """Score ``chromosome`` on ``cipher_text``.

        The score is the sum, over every block of the converted text that is
        also in ``sample_block_table``, of log2(sample frequency) times the
        number of occurrences of the block in the converted text.
        """
        total_fitness = 0
        parsed_block_table = self.getBlockTable(chromosome.convertText(cipher_text))
        for block, occurrences in parsed_block_table.items():
            if block in self.sample_block_table:
                total_fitness += math.log(self.sample_block_table[block], 2) * occurrences
        return total_fitness

    # --------------------------- BLOCK FUNCTIONS ---------------------------

    def getSampleBlockTable(self, sample_path):
        """Read the samples file and return a ``{block: count}`` dict.

        Each non-blank line is expected to hold a block and an integer count
        separated by whitespace.
        """
        block_table = {}
        # The context manager closes the file even if a line fails to parse.
        with open(sample_path) as input_file:
            for line in input_file:
                components = line.split()
                if not components:
                    continue  # tolerate blank lines
                # int() is applied to the whole field; chopping the last
                # character (to drop the newline) corrupted the final count
                # when the file did not end with a newline.
                block_table[components[0]] = int(components[1])
        return block_table

    def getBlockTable(self, input_string):
        """Return a dict counting every block (substring) of every
        space-separated word in ``input_string``."""
        block_table = {}
        for word in input_string.split(" "):
            for block in self.getBlocks(word):
                block_table[block] = block_table.get(block, 0) + 1
        return block_table

    def getBlocks(self, input_string):
        """Return all contiguous substrings of ``input_string``, shortest
        first and left to right within each length."""
        blocks = []
        total_length = len(input_string)
        for block_len in range(1, total_length + 1):
            for start_point in range(total_length - block_len + 1):
                blocks.append(input_string[start_point:start_point + block_len])
        return blocks
#!/usr/bin/env python
''' pip install pyspellchecker '''
from spellchecker import SpellChecker

spell = SpellChecker()

# Sample input containing two deliberate typos ('wlak', 'groun').
sample_words = ['let', 'us', 'wlak', 'on', 'the', 'groun']

# Words the checker does not recognise.
misspelled = spell.unknown(sample_words)

for word in misspelled:
    # Single most likely replacement.
    best_guess = spell.correction(word)
    print(best_guess)

    # Full set of plausible replacements.
    likely_options = spell.candidates(word)
    print(likely_options)