Esempio n. 1
0
    def test_capitalization(self):
        ''' check that letter case is ignored by lookups and comparisons '''
        checker = SpellChecker(language=None)
        # 'Bob' is added twice, 'Bab' once (presumably so that 'bob' ranks
        # first in the final correction assertion).
        for entry in ('Bob', 'Bob', 'Bab'):
            checker.word_frequency.add(entry)

        # membership must succeed for every casing of the stored word
        for variant in ('Bob', 'BOb', 'BOB', 'bob'):
            self.assertEqual(variant in checker, True)

        # results are reported in lower case and de-duplicated
        self.assertEqual(checker.unknown(['Bb', 'bb', 'BB']), {'bb'})
        self.assertEqual(checker.known(['BOB', 'bOb']), {'bob'})

        self.assertEqual(checker.candidates('BB'), {'bob', 'bab'})
        self.assertEqual(checker.correction('BB'), 'bob')
Esempio n. 2
0
        def clean_up_sentence(sentence):
            """Tokenize ``sentence``, fix misspelled tokens and stem the result.

            Args:
                sentence: raw text to clean up.

            Returns:
                A list with one lower-cased, stemmed string per token.
            """
            spell = SpellChecker()
            # tokenize the pattern
            tokens = nltk.word_tokenize(sentence)

            # Spelling correction. unknown() reports words in lower case, so
            # the comparison is done on the lower-cased token; the result is
            # lower-cased before stemming anyway.
            misspelled = spell.unknown(tokens)
            corrected = []
            for token in tokens:
                if token.lower() in misspelled:
                    # correction() may return None when it has no suggestion;
                    # keep the original token in that case.
                    token = spell.correction(token) or token
                corrected.append(token)

            # stem each word
            return [stemmer.stem(word.lower()) for word in corrected]
class SpellCheckDoc(BaseEstimator, TransformerMixin):
    """Transformer that strips punctuation and spell-corrects documents."""

    def __init__(self):
        # distance=1 restricts suggestions to an edit distance of one
        self.spell_ = SpellChecker(distance=1)

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X=None, y=None):
        """Return a spell-corrected copy of every document in ``X``.

        Args:
            X: iterable of strings.
            y: ignored.

        Returns:
            A list of strings, one per document: punctuation removed, tokens
            joined by single spaces, misspelled tokens replaced.
        """
        print("correcting spelling")

        def _string_correction(doc):
            tokens = word_tokenize(doc)
            mispelled_words = self.spell_.unknown(tokens)
            # correction() may return None when no suggestion exists; the
            # original token is kept then so that join() never sees None.
            return " ".join(
                (self.spell_.correction(token) or token)
                if token.lower() in mispelled_words else token
                for token in tokens)

        translations = str.maketrans('', '', string.punctuation)

        return [_string_correction(doc.translate(translations)) for doc in X]
Esempio n. 4
0
    def spellCorrectBackupBaseline(self, check_str):
        """
        Baseline spell checker
        uses spellchecker library

        Splits ``check_str`` on whitespace, corrects every purely alphabetic
        token the checker does not know and returns the tokens joined by
        single spaces. Corrected tokens come back lower-cased because the
        lower-cased token is what gets corrected.
        """
        print('spellCorrectBackupBaseline called')
        spell = SpellChecker()
        # known() only *queries* the dictionary and its result was discarded;
        # load_words() is what actually registers these domain terms so they
        # are not "corrected" away.
        spell.word_frequency.load_words(['zwave', 'rheem'])

        corrected = []
        for token in check_str.split():
            if token.isalpha():
                lowered = token.lower()
                if spell.unknown([lowered]):
                    # correction() may return None; keep the token then
                    token = spell.correction(lowered) or token
            corrected.append(token)

        return " ".join(corrected)
Esempio n. 5
0
def spellchecker_test(list_tokens, token_tags):
    """Print the SpellChecker suggestion for every unknown token.

    Args:
        list_tokens: tokens handed to ``SpellChecker.unknown``.
        token_tags: not used by this function.

    Returns:
        0 when the run completes, 405 when a ``TypeError`` was raised.
    """
    banner = '#' * 20
    print('\n{} \nBegin \'SpellChecker\' testing \n'.format(banner))

    try:
        checker = SpellChecker()
        # walk over the words the checker does not recognise
        for bad_word in checker.unknown(list_tokens):
            print(f'\nThe incorrect word is "{bad_word}"')
            # single most likely replacement
            suggestion = checker.correction(bad_word)
            print(
                f'Using Spellchecker, the correction is : {suggestion}'
            )
    except TypeError as error:
        print(f'Invalid string : {error}')
        return 405
    return 0
Esempio n. 6
0
def spellcheck(words):
    """Return the entries of ``words`` the spell checker does not know.

    A fixed list of extra terms is loaded into the checker first so that
    they are not reported. No corrections are computed.
    """
    # terms that must not be flagged as misspelled
    extra_terms = [
        '\n', '©', 'blog', 'website', 'monetization', 'php', 'analytics',
        'seo', 'wordpress', 'mysql', 'html5', 'css3', 'google', 'drupal',
        'facebook', 'youtube', 'linkedin'
    ]

    checker = SpellChecker(distance=1)
    checker.word_frequency.load_words(extra_terms)

    # unknown() yields the possibly misspelled words; hand them back as a list
    return list(checker.unknown(words))
def spelling_errors(dataset: list):
    """Compute the spelling error rate of every essay in ``dataset``.

    Each sample must provide its tokens under ``'essay_token'``. Unknown
    words of length <= 2 or starting with ``'@'`` are not counted. The rate
    (counted unknown words / number of tokens) is stored on the sample under
    ``'spelling_error_rate'``.

    Args:
        dataset: list of dict-like samples; modified in place.

    Returns:
        ``{'spelling_error_rate': {'mean': ..., 'std': ...}}`` over all
        samples.
    """
    spell_checker = SpellChecker()

    spell_error_list = []
    for data_sample in dataset:
        essay_words = data_sample['essay_token']
        error_words = [
            error_word for error_word in spell_checker.unknown(essay_words)
            # length is tested first so an empty token is never indexed
            if len(error_word) > 2 and not error_word.startswith('@')
        ]

        total_words = len(essay_words)
        # an essay without tokens has no errors; avoids ZeroDivisionError
        rate = len(error_words) / total_words if total_words else 0.0

        data_sample['spelling_error_rate'] = rate
        spell_error_list.append(rate)

    return {
        'spelling_error_rate': {
            'mean': np.mean(spell_error_list),
            'std': np.std(spell_error_list),
        }
    }
Esempio n. 8
0
def spelling11(data1, text1):
    """Spell-correct the text column ``text1`` of ``data1``.

    The checker is extended with the words found in
    'corporaForSpellCorrection.txt' before use.

    Args:
        data1: pandas DataFrame; its ``text1`` column is overwritten.
        text1: name of a column holding whitespace-separated text.

    Returns:
        ``data1`` with the column replaced by the corrected text. As before,
        every word (including the first) is preceded by one space.
    """
    spell = SpellChecker()
    spell.word_frequency.load_text_file('corporaForSpellCorrection.txt')

    corrected_rows = []
    for words in data1[text1].str.split():
        misspelled = spell.unknown(words)
        fixed = [
            # correction() may return None; keep the word in that case
            (spell.correction(word) or word) if word in misspelled else word
            for word in words
        ]
        corrected_rows.append(''.join(' ' + word for word in fixed))

    # Assign the whole column at once. The former
    # `data1[text1].iloc[k] = ...` is chained assignment, which pandas does
    # not guarantee to write through to the frame.
    data1[text1] = corrected_rows

    return data1
Esempio n. 9
0
def correct_spellings(text):
    """
    @Desc : 'Correct the spelling mistakes found in the text'
    @Parameters :
        'text' - 'text content'
    @Returns :
        'correct_text' - 'the processed text, words joined by single spaces'
    @Time : '2020/6/8 8:55'
    """
    spell = SpellChecker()
    words = text.split()
    misspelled_words = spell.unknown(words)

    correct_text = [
        # correction() may return None when it has no suggestion; the word is
        # kept unchanged then so that join() never receives None
        (spell.correction(word) or word) if word in misspelled_words else word
        for word in words
    ]

    return " ".join(correct_text)
Esempio n. 10
0
class newToolKit:
    """Spell-fixing helpers built on a tokenizer toolkit and a SpellChecker."""

    def __init__(self, nltktools):
        """Store the tokenizer toolkit and create the spell checker.

        Args:
            nltktools: object providing ``tokenize_words(text)``.
        """
        print("newToolKit instance")
        self.mynlp = nltktools
        self.spell = SpellChecker()

    def reduce_lengthening(self, text: str) -> str:
        """Shrink every run of 3+ identical characters to exactly two."""
        return re.sub(r"(.)\1{2,}", r"\1\1", text)

    def type_1_WordAutoFix(self, word: str) -> str:
        """Return the most likely correction of ``word``.

        The word is de-lengthened first. When the checker has no suggestion
        (``correction`` returns None) the de-lengthened word is returned, so
        the result is always a string.
        """
        word = self.reduce_lengthening(word)
        return self.spell.correction(word) or word

    def type_1_textAutoFix(self, text: str) -> str:
        """Correct every token of ``text``; each token is followed by a space."""
        tokens = self.mynlp.tokenize_words(text)
        return "".join(self.type_1_WordAutoFix(token) + " " for token in tokens)

    def type_1_textCandidates(self, text: str) -> list:
        """Return the correction candidates for the first token of ``text``.

        An empty set is returned when ``text`` has no tokens or the checker
        offers no candidates.
        """
        tokens = self.mynlp.tokenize_words(text)
        if not tokens:
            return set()
        return self.spell.candidates(tokens[0]) or set()

    def unknown(self, text):
        """Return the tokens of ``text`` that the spell checker does not know."""
        tokens = self.mynlp.tokenize_words(text)
        return self.spell.unknown(tokens)

    def fixemall(self, text: str):
        """Correct every token of ``text``; same result as type_1_textAutoFix.

        The extra de-lengthening pass this method used to run is redundant:
        type_1_WordAutoFix already applies it and applying it twice changes
        nothing.
        """
        return self.type_1_textAutoFix(text)
Esempio n. 11
0
async def ia_corrige_palavras(msg):
    """Answer a 'corrigir' command with a spell-corrected quoted message.

    Acts only on supergroup text messages that start with 'corrigir' and
    reply to a message containing text. For every misspelled word of the
    quoted message a corrected sentence is sent, the alternatives are shown
    one after another by editing that message, and finally the most likely
    correction is restored.

    Args:
        msg: message dict with 'chat', 'from', 'text' and
            'reply_to_message' entries.

    Returns:
        True when processing was abandoned (no quoted message, nothing
        misspelled, or an error occurred); None otherwise.
    """
    # Imported locally so this block does not rely on a module-level import.
    import asyncio

    try:
        chat_id = msg['chat']['id']
        chat_type = msg['chat']['type']
        if chat_type == 'supergroup' and msg.get('text'):
            texto = msg['text']
            resposta = msg.get('reply_to_message')
            if resposta is None:
                # previously a TypeError swallowed by the except branch
                return True
            if 'text' in resposta and texto.startswith('corrigir'):
                spell = SpellChecker(language='pt')
                mensagem = resposta['text']
                misspelled = spell.unknown(mensagem.split())
                if not misspelled:
                    # previously an IndexError swallowed by the except branch
                    return True
                for palavra_final in misspelled:
                    corrigir = spell.correction(palavra_final)
                    if corrigir is None:
                        # no suggestion for this word; nothing to show
                        continue
                    candidatos = spell.candidates(palavra_final) or ()
                    # Replace the word being processed. The old code always
                    # replaced the first misspelled word, whatever word the
                    # correction belonged to.
                    mensagem_corrigida = mensagem.replace(
                        palavra_final, corrigir)
                    a = await bot.sendMessage(
                        chat_id,
                        f"@{msg['from']['username']} `aqui esta a frase corrigida, em 2 segundos irei mostrar outras alternativas caso existam:`\n***{mensagem_corrigida}***",
                        'markdown')
                    # asyncio.sleep yields to the event loop; time.sleep
                    # would block every other coroutine for two seconds
                    await asyncio.sleep(2)
                    for candidato in candidatos:  # other possible corrections
                        alternativas_corrigidas = mensagem.replace(
                            palavra_final, candidato)
                        await bot.editMessageText(
                            (chat_id, a['message_id']),
                            f"`Algumas alternativas:`\n***{alternativas_corrigidas}***",
                            'markdown')
                        await asyncio.sleep(2)
                    await bot.editMessageText(
                        (chat_id, a['message_id']),
                        f"`Correção:`\n***{mensagem_corrigida}***", 'markdown')
    except Exception:
        # Best effort, as before. `except Exception` (instead of a bare
        # except) lets task cancellation and interpreter exit propagate.
        return True
Esempio n. 12
0
def view_essay_submission(request, essay_submission_id):
    """Render one essay submission together with simple text statistics.

    Anonymous users are redirected to 'index'. For authenticated users the
    context holds the submission, a list of
    ``[word, correction, candidates]`` entries for possibly misspelled
    words, the five most common words (stop words and punctuation excluded)
    and the words that occur exactly once.
    """
    if not request.user.is_authenticated:
        return HttpResponseRedirect(reverse_lazy('index'))

    essay_submission = EssaySubmission.objects.get(
        id=int(essay_submission_id))
    complete_text = essay_submission.content

    # one [word, best guess, all guesses] triple per possibly misspelled word
    checker = SpellChecker()
    mispelled_list = [
        [word, checker.correction(word), checker.candidates(word)]
        for word in checker.unknown(complete_text.split())
    ]

    # frequency table without stop words and punctuation symbols
    word_freq = Counter(
        token.text for token in nlp(complete_text)
        if not (token.is_stop or token.is_punct)
    )

    context = {
        'essay_submission': essay_submission,
        'mispelled_list': mispelled_list,
        # 5 commonly occurring words with their frequencies
        'common_words': word_freq.most_common(5),
        # words seen exactly once
        'unique_words': [
            word for word, freq in word_freq.items() if freq == 1
        ],
    }
    return render(request, "view_essay_submission.html", context)
Esempio n. 13
0
def clusteredTopNouns(tag):
    """Cluster the top nouns of ``tag`` and pick one representative each.

    Misspelled nouns are first merged into their correction when that
    correction is also a top noun. The remaining nouns are embedded with
    ``nlp`` and grouped by k-means into ``len(nouns) // 3`` clusters (at
    least one).

    Args:
        tag: passed on to ``createTopNounDict``.

    Returns:
        ``(filteredFeatures, clsters)``: ``filteredFeatures`` maps the
        highest-count word of every cluster to that count, ``clsters`` maps
        the same word to the tuple of all words in its cluster. Both are
        empty when there are no nouns.
    """
    topNounWords = createTopNounDict(tag)

    spell = SpellChecker(distance=1)  # set at initialization
    misspelled = spell.unknown(list(topNounWords.keys()))
    for word in misspelled:
        correction = spell.correction(word)  # computed once per word
        # Skip self-corrections: merging a word into itself used to double
        # its count. A None correction is never a key, so it is skipped too.
        if (correction == word or correction not in topNounWords
                or word not in topNounWords):
            continue
        topNounWords[correction] += topNounWords.pop(word)

    if not topNounWords:
        # nothing to cluster; KMeans cannot be fitted on zero samples
        return {}, {}

    wordvectors = {word: nlp(word).vector for word in topNounWords}

    # the width is taken from the vectors instead of a fixed 300
    vector_size = len(next(iter(wordvectors.values())))
    X = np.zeros((len(wordvectors), vector_size))
    for i, vector in enumerate(wordvectors.values()):
        X[i] = vector

    # at least one cluster, so fewer than three words no longer request zero
    n_clusters = max(1, len(X) // 3)
    kmeans = cluster.KMeans(n_clusters=n_clusters, max_iter=1000)
    kmeans.fit(X)

    clusters = {}
    for word, label in zip(wordvectors, kmeans.labels_):
        clusters.setdefault(label, []).append((word, topNounWords[word]))

    filteredFeatures = {}
    clsters = {}
    for members in clusters.values():
        words, counts = zip(*members)
        top_count = max(counts)
        # first word holding the highest count represents the cluster
        top_word = words[counts.index(top_count)]
        filteredFeatures[top_word] = top_count
        clsters[top_word] = words

    return filteredFeatures, clsters
Esempio n. 14
0
    def class_name_should_be_noun(self, identifier):
        """Report whether ``identifier`` lacks a noun among its parts.

        Parts unknown to the spell checker are dropped, the rest is
        POS-tagged with nltk, and parts tagged 'NN', 'NNP' or 'NOUN' are
        counted.

        Returns:
            True when no such part was found (a message is printed), False
            otherwise.
        """
        checker = SpellChecker()
        unknown_parts = checker.unknown(identifier.parts)
        # keep only the parts the checker recognises
        recognised_parts = [
            part for part in identifier.parts if part not in unknown_parts
        ]
        pos_tags_result = nltk.pos_tag(recognised_parts)
        print(pos_tags_result)

        noun_tags = ['NN', 'NNP', 'NOUN']
        noun_count = 0
        for pair in pos_tags_result:
            if pair[1] not in noun_tags:
                continue
            # NOTE(review): `id` is not defined in this method, so this
            # prints whatever `id` resolves to in the enclosing scope; the
            # message also says "Verb" although noun tags are matched -
            # confirm the intent.
            print(str(id), '\tVerb is found in {}'.format(pair))
            noun_count += 1

        if noun_count != 0:
            return False
        print(str(id), '\t class name should be noun')
        return True
Esempio n. 15
0
def spell_check(doc, raw_terms):
    """Record a MISSPELLING match for every unknown token of ``doc``.

    ``raw_terms`` are loaded into the checker first. Each unknown token
    whose text does not start with an apostrophe increments the global
    ``all_matches`` counter and the "MISSPELLING" entry of
    ``total_matches``; its character span is appended to ``match_ents``
    unless it overlaps an entry of ``match_ents``, ``definitions`` or
    ``quantity_spans``.
    """
    global all_matches
    checker = SpellChecker()
    checker.word_frequency.load_words(raw_terms)

    for token in doc:
        text = token.text
        flagged = checker.unknown([text])
        if not flagged or text.startswith("'"):
            continue

        all_matches += 1
        total_matches["MISSPELLING"] = total_matches.get("MISSPELLING", 0) + 1

        # character offsets of the token inside the document
        start = token.idx
        end = start + len(token.text)

        # checked in this order; stops at the first overlapping collection
        span_lists = (match_ents, definitions, quantity_spans)
        if any(overlap(spans, start, end) for spans in span_lists):
            continue

        match_ents.append({
            "start": start,
            "end": end,
            "label": "MISSPELLING",
        })
Esempio n. 16
0
def check(words):
    """Return ``words`` with misspelled tokens replaced by their correction.

    The text is tokenized with ``TweetTokenizer``. Tokens the spell checker
    does not know are replaced by the most likely correction; other
    single-character tokens are blanked out (their position is kept as an
    empty string); everything else is kept as is.

    Args:
        words: the text to check.

    Returns:
        The resulting tokens joined by single spaces.
    """
    spell = SpellChecker()
    tknzr = TweetTokenizer()
    allWords = tknzr.tokenize(words)
    # find those words that may be misspelled
    misspelled = spell.unknown(allWords)

    correctedWords = []
    for word in allWords:
        if word in misspelled:
            # correction() may return None when it has no suggestion; keep
            # the word then so that join() never receives None
            correctedWords.append(spell.correction(word) or word)
        elif len(word) == 1:
            correctedWords.append("")
        else:
            correctedWords.append(word)
    return " ".join(correctedWords)
Esempio n. 17
0
class SpellCheck:
    """Sentence-level spell correction / filtering with a lazy SpellChecker."""

    def __init__(self):
        # created on first use, see _get_spell
        self.spell = None

    def _get_spell(self):
        """Create the SpellChecker on first use and return it."""
        if self.spell is None:
            from spellchecker import SpellChecker
            self.spell = SpellChecker()
        return self.spell

    def spell_correct(self, x):
        """
        Given a sentence x, replace every word the spell checker reports as
        misspelled by its most likely correction.

        Words for which the checker has no suggestion are left unchanged.

        :param x: the sentence to correct
        :return: the tokens of x, corrected, joined by single spaces
        """
        spell = self._get_spell()
        word_list = word_tokenize(x)
        corrections = {
            # correction() may return None; fall back to the word itself
            word: spell.correction(word) or word
            for word in spell.unknown(word_list)
        }
        return ' '.join(corrections.get(word, word) for word in word_list)

    def spell_check(self, x):
        """
        Given a sentence x, return the same sentence but with all the
        misspelled words removed.

        Notice this function doesn't correct any misspelled words, it just
        filters them out. If you want to correct those words you should use
        spell_correct.

        :param x: the sentence to filter
        :return: the known tokens of x joined by single spaces
        """
        spell = self._get_spell()
        word_list = word_tokenize(x)
        correct_words = spell.known(word_list)
        # known() reports words in lower case; compare case-insensitively so
        # correctly spelled capitalised words are not dropped
        return ' '.join(
            word for word in word_list if word.lower() in correct_words)
Esempio n. 18
0
def allPermutations(str):
    """Print dictionary words that are permutations of ``str``.

    Every permutation unknown to the spell checker is corrected; a
    correction is accepted when it has the same length as the permutation
    and is itself a permutation of ``str``. Accepted corrections are printed
    as they are found and once more, without repeats, at the end.

    Args:
        str: the scrambled word (the name shadows the builtin; it is kept
            because it is part of the signature).
    """
    spell = SpellChecker()

    # Get permutations
    jumbledwords = [''.join(perm) for perm in permutations(str)]
    # set for O(1) membership tests; the list can hold n! entries
    jumbled_lookup = set(jumbledwords)

    misspelled = spell.unknown(jumbledwords)
    wellspelled = []

    print("Please wait while we compute and show the final suggestions")
    for word in misspelled:
        # Get the one `most likely` answer
        correction = spell.correction(word)
        if correction is None:
            # no suggestion at all; len(None) used to raise here
            continue

        # Same length as the search key and one of its permutations
        if len(word) == len(correction) and correction in jumbled_lookup:
            # Temporary print to have the user glued on
            print(correction)

            # If not repeated
            if correction not in wellspelled:
                wellspelled.append(correction)

    # Print all suggestions
    print("\n\nFinal Suggestions!")
    for item in wellspelled:
        print(item)
Esempio n. 19
0
class spaCySpellChecker(object):
    """Pipeline component that rebuilds a Doc from spell-corrected tokens."""

    def __init__(self, nlp, custom_dictionary):
        """Set up the Spanish spell checker.

        Args:
            nlp: not used by this class.
            custom_dictionary: dict mapping a token text to its replacement;
                looked up before spell checking.
        """
        # set_extension raises when the extension is already registered,
        # which made a second instantiation fail
        if not Token.has_extension('spellchecker_unknown'):
            Token.set_extension('spellchecker_unknown', default=None)
        self.checker = SpellChecker(language='es')
        self.custom_dictionary = custom_dictionary

    def __call__(self, doc):
        """Return a new Doc whose words are the corrected tokens of ``doc``.

        Tokens whose check raises ``UnicodeEncodeError`` are left out of the
        result, as before.
        """
        correct_words = []
        for token in doc:
            text = self.custom_dictionary.get(token.text, token.text)
            try:
                misspelled = self.checker.unknown([text])
                if misspelled:
                    word = next(iter(misspelled))
                    # correction() may return None; Doc needs a string
                    correct_words.append(self.checker.correction(word) or text)
                else:
                    correct_words.append(text)
            except UnicodeEncodeError:
                # best effort: the token is skipped
                continue
        return Doc(doc.vocab, words=correct_words)
Esempio n. 20
0
def spell_check(sent, language):
    """Describe the spelling errors of ``sent`` in three messages.

    Args:
        sent: sentence to check; split on whitespace.
        language: language passed to ``SpellChecker``.

    Returns:
        A 3-tuple of strings: the misspelled words, the most likely
        correction per word, and the candidate corrections per word.
    """
    checker = SpellChecker(language=language)

    # words of the sentence the checker does not know
    misspelled = checker.unknown(sent.split())

    # best guess and full set of guesses, keyed by misspelled word
    most_likely = {word: checker.correction(word) for word in misspelled}
    candidats = {word: checker.candidates(word) for word in misspelled}

    errors_msg = "The errors in the sentence are : {} ".format(misspelled)
    best_msg = "The most likely answer : {}".format(most_likely)
    options_msg = "The other likely options  : {}".format(candidats)
    return (errors_msg, best_msg, options_msg)
Esempio n. 21
0
def spell_check():
    """Spell-check the identifiers read from standard input.

    The global ``known`` set is loaded into the checker and grows with every
    identifier examined. For each new identifier the parts obtained by
    splitting on underscores and case boundaries are checked, and one
    ``s/wrong/right`` line is printed per unknown part.
    """
    checker = SpellChecker()
    checker.word_frequency.load_words(known)

    def _worth_checking(w):
        # not seen before, not starting with a quote or digit, longer than
        # four characters, ASCII only
        return (w.lower() not in known and w[0] not in "'0123456789"
                and len(w) > 4 and all(ord(c) < 128 for c in w))

    for num, line in enumerate(sys.stdin.readlines(), start=1):
        words = [w for w in re.findall(r"[\w']+", line) if _worth_checking(w)]
        known.update(w.lower() for w in words)
        # progress indicator, overwritten in place
        print('#', num, end='\r')
        time.sleep(0.002)
        for identifier in words:
            parts = re.split(
                r"(?<=[a-z0-9])(?=[A-Z])|_|(?<=[A-Z])(?=[A-Z][a-z])",
                identifier)
            # one substitution hint per unknown part
            for word in checker.unknown(parts):
                print('#', num, identifier, '\t',
                      "s/%s/%s" % (word, checker.correction(word)))
Esempio n. 22
0
def spellcheck(input_file):
    """Spell-check the words of ``input_file`` and write a report.

    The input holds one word per line. The report is written next to it as
    ``<input without extension>_output.txt``, one numbered line per reported
    word. Its content is controlled by the module-level settings
    ``show_only_words``, ``multiple_recommendations``,
    ``max_recommendations`` and ``only_show_words_with_recommendations``.

    Args:
        input_file: path of the word list.
    """
    # Imported locally so this block does not rely on a module-level import.
    import os

    # Read word set from input_file
    with open(input_file, "r") as file:
        word_list = file.read().split("\n")

    spell = SpellChecker()

    # find those words that may be misspelled
    misspelled = spell.unknown(word_list)

    changes = []
    for word in misspelled:
        if word == "":
            continue

        corrected = ""
        # entries are numbered in the order they are kept
        entry = str(len(changes)) + ". " + word

        if not show_only_words:
            # Fetch the best autocorrect; correction() may return None, in
            # which case the word counts as its own correction
            corrected = spell.correction(word) or word
            entry += " --> " + corrected
            # Fetch a list of other potential spell options (optional)
            if multiple_recommendations:
                options = list(spell.candidates(word) or ())
                entry += "; " + str(options[:max_recommendations])

        if not only_show_words_with_recommendations or word != corrected:
            changes.append(entry)

    # splitext only strips the extension; splitting on the first "." broke
    # paths such as "./words.txt" or "data.v2/words.txt"
    output_file = os.path.splitext(input_file)[0] + "_output" + ".txt"
    with open(output_file, 'w') as writer:
        writer.writelines(c + "\n" for c in changes)
Esempio n. 23
0
def main():
    """Normalise the class names stored in ``CLASSES_PATH``.

    Every key of the JSON mapping is lower-cased and stripped of digits; the
    result becomes its value. Values the spell checker does not know are
    then replaced by their most likely correction. The mapping is written to
    'cleaned_classes.json'.
    """
    spell = SpellChecker()
    with open(CLASSES_PATH, 'r') as f:
        class_map = json.load(f)

    print('Number of initial classes: ', len(class_map.keys()))
    # Remove numbers
    for word in class_map.keys():
        class_map[word] = ''.join(e for e in word.lower() if not e.isdigit())

    print(
        len(set(class_map.values())),
        ' number of classes remaining after removing special chars and numbers.'
    )
    # find those words that may be misspelled
    misspelled = spell.unknown(list(class_map.values()))

    for i, (word, cleaned_word) in enumerate(list(class_map.items())):
        # Update the class mapping if the cleaned word is misspelled according to SpellChecker
        if cleaned_word in misspelled:
            # correction() may return None; keep the cleaned word then
            # instead of writing null into the mapping
            class_map[word] = spell.correction(cleaned_word) or cleaned_word

        if i % 500 == 0 and i != 0:
            print(i)

    with open('cleaned_classes.json', 'w') as f:
        json.dump(class_map, f, indent=4)

    print(
        len(set(class_map.values())),
        ' number of classes remaining after removing spelling errors according to SpellChecker.'
    )
def error_grammar_frequency_en(text):
    """
    Percentage of misspelled words in an English text.

    The value is the number of distinct words unknown to the spell checker
    divided by the total number of tokens, times 100.

    :param text: the full text of the e-mail
    :return: the percentage as a float; 50.0 when the text has no tokens
    """

    spell = SpellChecker()

    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    size_txt = len(tokens)

    misspelled = spell.unknown(tokens)

    # Explicit zero check instead of a bare `except`, which would also have
    # hidden unrelated errors such as KeyboardInterrupt.
    if size_txt == 0:
        freq = 0.5
    else:
        freq = len(misspelled) / size_txt

    # transform in percent
    return freq * 100
Esempio n. 25
0
def preprocess_reviews(reviews):
  """Normalise every review of ``reviews`` in place.

  Per review: special characters and digits become spaces, isolated single
  lower-case letters are removed, whitespace is collapsed and the text is
  lower-cased. Words listed in ``code`` are then replaced by "code", words
  listed in ``student`` by "student"; for any other word unknown to the
  spell checker the suggested correction is printed (the text is not
  changed).

  Args:
    reviews: mutable sequence of strings; modified in place.
  """
  # project/code identifiers that should all read as "code"
  code = frozenset((
    "expertiza", "travser", "questionnare", "realestatemodel",
    "getrubricarray", "questarray", "curquestionnaire", "iquiry",
    "questionnarecontroller", "vmquestionresponse", "capybaraerror",
    "interfce", "reportstrategy", "visiualisation", "potentialbuyer",
    "sepc", "nilclass", "supplementaryreviewquestionnaire", "pannernode",
    "realestatecompany", "codeclimate", "househunter", "offscreencanvas",
    "herokuapp", "webaudio", "nomethoderror", "constantsourcenode",
    "quesparams", "assignmentparticipant", "bluetoothdiscoverysession",
    "webidl", "offscreenrenderingcontext",
    "offscreencanvasrenderingcontext", "oscillatornode", "rpec", "gemfile",
    "rubocop", "requesteduser", "travisci", "lowfi", "bacui",
    "importfilecontroller", "metareview", "oodd", "assignmentteam",
    "selfreview", "amazonaws", "agenttourlist", "signuptopic",
    "functionalites", "rubymine", "setvaluecurveattime", "gradinghistory",
    "simplecov", "bluetoothadapter", "issuecomment", "tourscontroller",
    "metareviews", "umls", "webbluetooth", "scenairos", "getrole", "rspecs",
    "agenttest", "potentialbuyercontroller", "customeroptions",
    "standarderror", "nameurl", "assignmentcontroller",
    "addcreatetopicsection", "createconstantsource", "baseaudio",
    "reviewbid", "reviewbidcontroller", "rscore", "assignmentscontroller",
    "adminpanel", "testfolder", "potentialbuyers", "memebershave",
    "stereocontext", "droptopicdeadline", "getphoto",
    "applicationcontroller", "searchcontroller", "responsecontroller",
    "generatereport", "hasallprivilegesof", "htmlcanvascontextrendering",
    "webglrenderingcontext", "reviewresponsemap", "timezonepref",
    "gradescontroller", "gemlock", "appendcreatetopicsection",
    "offscreencanvascontextrendering", "deletedtour", "mdfile", "getlocal",
    "parsererror", "reviewmapping", "oodesign", "ooddprogram", "coverrange",
    "usinggit", "gitgit", "herokuwould", "navebar", "numericality",
    "repositoryhttps", "edgecases", "expertzia", "metareviewer",
    "sqlexception", "experitza", "gdrive", "assignmentquestionnaire",
    "questionnairecontroller", "setcurrentvalueattime", "uncaughtthrowerror",
    "customerbookingscontroller", "runningbundle", "createhouseinformation",
    "exertiza", "staticpagescontroller", "createtopic", "addtopic",
    "tourmanagement", "functionalties", "devcenter", "googleuser",
    "applicationrecord", "factorybot", "ereadme", "inquirydetails",
    "funtionalities", "existassignment", "modelsand", "baseaudiocontext",
    "constantsourceoptions", "foreignmodel", "bookmarkratingresponsemap",
    "bookmarkratingquestionnaire", "degin", "audioparam",
    "signupsheetcontroller", "screencase", "participantsuper",
    "setposition", "setorientation", "waveshapernode", "biquadfilternode",
    "betahttps", "gitusing", "isrealtor", "ishousehunter",
    "addquestionnairetablerow", "popupcontroller", "hasmany", "hasone",
    "debugbranch", "userscontroller", "userr", "heatgrid", "architcture",
    "flowchats", "interfact",
  ))
  # user names / opaque tokens that should all read as "student"
  student = frozenset((
    "kunalnarangtheone", "ychen", "stereopannernode", "swivl",
    "ibwsfrvjmiytql", "slwhv", "iucqq", "sidekiq", "yzhu", "nilaykapadia",
    "jasminewang", "bebacc", "skannan", "rustfmt", "ocde", "drupadhy",
    "ajain", "amody", "upadhyaydevang", "henlo", "txmwju", "kqbvycku",
    "bdxxa", "rxsun", "bmyvjy", "rommsw", "travisbuddy", "hhharden",
    "appveyor", "rahulsethi", "rshakya", "ziwiwww", "nikitaparanjape",
    "hounse", "tourid", "probablty", "myaccounts", "nainly", "flazzle",
    "folls", "dhamang", "dfef", "afbc", "eqsy", "impliescode", "jwarren",
    "dodn", "ferjm", "jisx", "coulhasdn", "cbbdf", "partipant", "jwboykin",
    "amogh", "agnihotri", "fdea", "rbit", "rbdoes", "pronciple", "sbasnet",
    "kvtmnznc", "ppvasude", "ceec", "edabe", "namig", "pptn",
    "explainationit", "urswyvyc",
  ))
  spell = SpellChecker()
  for i, text in enumerate(reviews):
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # Removing special character
    text = re.sub("'", ' ', text)  # Removing quotes
    text = re.sub(r'\d', ' ', text)  # Replacing digits by space
    text = re.sub(r'\s+[a-z][\s$]', ' ', text)  # Removing single characters and spaces alongside
    text = re.sub(r'\s+', ' ', text)  # Replacing more than one space with a single space
    # NOTE(review): '.' and ':' were already replaced by spaces above, so
    # this condition looks unreachable - confirm the intended order.
    if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
      text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
    text = text.lower()
    # the word list is taken once, before any replacement below
    for word in text.split():
      if word in code:
        text = text.replace(word, "code")
      elif word in student:
        text = text.replace(word, "student")
      elif spell.unknown([word]):
        recommended = spell.correction(word)
        print(recommended)
    reviews[i] = text
Esempio n. 26
0
def translate():
    """Read a Spanish word from stdin, spell-check it and look it up.

    When the word is unknown to the Spanish spell checker the suggested
    alternatives are printed. The word is then sent to the translator
    (Spanish to English).
    """
    translator = Translator()
    spell = SpellChecker(language=u'es')

    print("Enter word to translate. Enter ! when you are done.")

    # find those words that may be misspelled
    word = str(input()).lower()
    if spell.unknown([word]):
        # candidates() may return None when nothing similar is known;
        # iterating over it used to raise TypeError
        candidates = spell.candidates(word) or ()
        print("The word was not found. Suggestions: " + ", ".join(candidates))

    # NOTE(review): the translations are fetched but neither returned nor
    # printed here - confirm whether more code was meant to follow.
    trans = translator.translate(word, dest='en',
                                 src='es').extra_data['all-translations']
Esempio n. 27
0
def spell_checker(text):
    """Spell-check ``text`` and return the findings as a JSON string.

    The JSON object has the keys 'suggestions' (candidate corrections per
    misspelled word), 'misspelled' (the unknown words) and 'likely_correct'
    (the most likely correction per misspelled word). Values that json
    cannot encode natively are handed to ``set_default``.
    """
    checker = SpellChecker()

    # words of the text the checker does not know
    misspelled = checker.unknown(text.split())

    # single best guess and all guesses, in the same word order
    likely_correct = [checker.correction(word) for word in misspelled]
    suggestions = [checker.candidates(word) for word in misspelled]

    report = {
        'suggestions': suggestions,
        'misspelled': misspelled,
        'likely_correct': likely_correct,
    }
    return json.dumps(report, default=set_default)
def check_spelling(
    input_text_or_list: Union[str, List[str]],
    lang='en',
    ignore_word_file_path: Union[str, Path] = _IGNORE_SPELLCHECK_WORD_FILE_PATH
) -> str:
    """Check and correct the spelling of a text or a list of tokens.

    The input is lower-cased. A string is tokenized with ``word_tokenize``;
    from a list, ``None`` and empty entries are dropped. Every occurrence of
    a token unknown to the checker is replaced by its most likely
    correction; tokens without a suggestion are kept.

    Args:
        input_text_or_list: text, or list of tokens, to check.
        lang: language passed to ``SpellChecker``.
        ignore_word_file_path: text file whose words are loaded into the
            checker before checking.

    Returns:
        The tokens joined by single spaces, stripped; '' for ``None`` or
        empty input.
    """
    if input_text_or_list is None or len(input_text_or_list) == 0:
        return ''
    spelling_checker = SpellChecker(language=lang, distance=1)
    # TODO: add acronyms into spell checker to ignore auto correction specified by _IGNORE_SPELLCHECK_WORD_FILE_PATH
    spelling_checker.word_frequency.load_text_file(ignore_word_file_path)
    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list.lower())
    else:
        tokens = [
            token.lower() for token in input_text_or_list
            if token is not None and len(token) > 0
        ]
    corrections = {
        # correction() may return None; keep the word in that case
        word: spelling_checker.correction(word) or word
        for word in spelling_checker.unknown(tokens)
    }
    # A lookup per token fixes every occurrence; `tokens.index(word)` only
    # ever reached the first one.
    return ' '.join(corrections.get(token, token) for token in tokens).strip()
Esempio n. 29
0
def spellcheck():
    """Spell-check an uploaded text file and render the result page.

    Users that are not logged in are redirected to the login page. On POST
    the uploaded file 'wrongText' is decoded as UTF-8 and split on
    whitespace; the words unknown to the spell checker are shown separated
    by ", ". Any other request renders the page with a placeholder message.
    """
    if not session.get('logged_in'):
        return redirect(url_for('login'))

    if request.method != 'POST':
        return render_template('spellcheck.html', webresult="None So Far!")

    try:
        file = request.files['wrongText']
    except KeyError:
        # no file part in the form; other errors are no longer hidden
        file = None

    if not file:
        return render_template('spellcheck.html',
                               webresult="No Typos here")

    filename = secure_filename(file.filename)
    logger.info('User spell checked: "' + filename + '"')

    # Read the file, decode from bytes to string, split into words
    words = file.read().decode("utf-8").split()

    # the checker is only built once there is something to check
    mispelled = SpellChecker().unknown(words)
    if mispelled:
        # plain comma-separated list; the old string started with " , "
        result = ", ".join(mispelled)
    else:
        result = "No Typos here!"

    return render_template('spellcheck.html', webresult=result)
Esempio n. 30
0
class NLPUtils:
    """Small collection of text clean-up helpers.

    Wraps a Porter stemmer, a WordNet lemmatizer, a spell checker and the
    English stop-word list behind simple string-in / string-out methods.
    """

    def __init__(self, language):
        """Create the stemmer, lemmatizer, spell checker and stop-word set.

        Args:
            language: Accepted for interface compatibility.
                NOTE(review): the value is currently not used; the
                stop-word list is always the English one.
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

        self.spell = SpellChecker()
        # English stop words, stored as a set for O(1) membership tests.
        self.STOPWORDS = set(stopwords.words('english'))

    def remove_stopwords(self, text):
        """Return ``text`` with every stop word removed.

        ``text`` is converted with ``str()`` first, split on whitespace and
        re-joined with single spaces.  Matching is case-sensitive.
        """
        return " ".join(
            word for word in str(text).split() if word not in self.STOPWORDS)

    def stem_words(self, text):
        """Return ``text`` with every whitespace-separated word stemmed."""
        return " ".join(self.stemmer.stem(word) for word in text.split())

    def lemmatize_words(self, text):
        """Return ``text`` with every whitespace-separated word lemmatized."""
        return " ".join(
            self.lemmatizer.lemmatize(word) for word in text.split())

    def correct_spellings(self, text):
        """Return ``text`` with unknown words replaced by their correction.

        Words are split on whitespace.  A word for which the spell checker
        has no suggestion is kept unchanged.
        """
        words = text.split()
        misspelled_words = self.spell.unknown(words)

        corrected_text = []
        for word in words:
            # unknown() reports lower-cased words, so compare the lower-cased
            # token as well; otherwise capitalised typos are never corrected.
            if word in misspelled_words or word.lower() in misspelled_words:
                suggestion = self.spell.correction(word)
                # correction() may return None when it finds no candidate;
                # keep the original word so the join below cannot fail.
                corrected_text.append(word if suggestion is None else suggestion)
            else:
                corrected_text.append(word)
        return " ".join(corrected_text)
class Incubator:
	"""Genetic-algorithm incubator that searches for a chromosome decrypting a cipher text.

	Instance attributes:
		sample_block_table: A dictionary containing all blocks in sample_path and their frequency
		spellchecker: A pyspellchecker instance with all the words in words_path added
		population: Total population of the incubator, indicating how many chromosomes exist at one time
		elites: How many elites are carried over for each generation
		children: How many children are created for each generation
		randoms: How many random chromosomes are added each generation
		tournament_size: How many chromosomes are considered in a tournament
		cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
		mutation_chance: Chance of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
		shock_enabled: True if genetic shock enabled, false otherwise
		shock_threshold: Number of cycles of fitness stagnation before genetic shock is triggered.
		max_cycles: Cycle # at which the simulation terminates
	"""

	def __init__(self, sample_path, words_path, elites, children, randoms, tournament_size, cross_chance, mutation_chance, shock_value, max_cycles):
		"""Load the training data and store the simulation parameters.

		Parameters:
			sample_path: A path to a samples source file containing all training data to be fed to the incubator
			words_path: A path to all words which should be considered valid in addition
				to those already in pyspellchecker.
			elites: How many elites are carried over for each generation
			children: How many children are created for each generation
			randoms: How many random chromosomes are added each generation
			tournament_size: How many chromosomes are considered in a tournament
			cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
			mutation_chance: Chance of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
			shock_value: 0 (or less) if genetic shock disabled. Otherwise shock is enabled and shock_threshold is set to shock_value
			max_cycles: Cycle # at which the simulation terminates

		Raises:
			AssertionError: if cross_chance + mutation_chance is not (approximately) one.
		"""
		#Initializes sample_block_table
		self.sample_block_table = self.getSampleBlockTable(sample_path)

		#Initializes spellchecker
		self.spellchecker = SpellChecker()
		self.spellchecker.word_frequency.load_text_file(words_path)

		#Checks cross_chance and mutation_chance are valid. isclose tolerates float rounding
		#so that valid pairs are not rejected because their sum is off by one ulp.
		assert math.isclose(cross_chance + mutation_chance, 1)

		#Loads all incubator parameters
		self.elites = elites
		self.children = children
		self.randoms = randoms
		self.population = self.elites + self.children + self.randoms

		self.tournament_size = tournament_size

		self.cross_chance = cross_chance
		self.mutation_chance = mutation_chance

		#Handles shock_value
		if shock_value <= 0:
			self.shock_enabled = False
			self.shock_threshold = 0
		else:
			self.shock_enabled = True
			self.shock_threshold = shock_value

		self.max_cycles = max_cycles

		#Prints incubator summary if verbose enabled
		if __VERBOSE__:
			print("Incubator Summary:")
			print("sample_path: " + sample_path + "  words_path: " + words_path)
			print("Total population: " + str(self.population))
			print("Elites: " + str(self.elites) + "  Children: " + str(self.children) + "  Randoms: " + str(self.randoms))
			print("Tournament size: " + str(self.tournament_size) + "  Cross chance: " + str(self.cross_chance) + "  Mutation chance: " + str(self.mutation_chance))
			print("Shock enabled: " + str(self.shock_enabled) + "  Shock threshold: " + str(self.shock_threshold))
			print("Max cycles: " + str(self.max_cycles))
			print("\n")

	#
	# TRAINING FUNCTIONS
	#

	def train(self, cipher_text):
		"""Evolve chromosomes until one decrypts cipher_text or max_cycles is reached.

		A chromosome counts as the key when the spellchecker knows every
		space-separated word of the text it produces.

		Returns:
			A (chromosome, cycles) tuple: the key that was found (or the fittest
			chromosome once max_cycles is reached) and the number of cycles run.
		"""
		#Initializes cycle counter
		cycles = 0

		#Generates pool of chromosomes
		chromosomes = [self.getRandomChromosome() for _ in range(self.population)]

		#Genetic shock trigger variables. Triggers if fitness is stagnant for shock_threshold cycles
		best_fitness = 0
		shock_ticker = 0

		#Starts timer
		start_time = time.time()

		while True:
			#Increments cycle counter
			cycles += 1

			#Checks all chromosomes to see if the correct one has been found
			for chromosome in chromosomes:
				decrypted_text = chromosome.convertText(cipher_text)
				if len(self.spellchecker.unknown(decrypted_text.split(" "))) == 0:
					if __VERBOSE__:
						print("Found key! " + str(chromosome))
						print("Decrypted text:  " + decrypted_text)
						print("")

					#The key is returned whether or not verbose output is on;
					#only the report above is conditional.
					return (chromosome, cycles)

			#Creates list of (chromosome, fitness) tuples in order of decreasing fitness
			chromosome_fitness = [(chromosome, self.getFitness(chromosome, cipher_text)) for chromosome in chromosomes]

			#sort + reverse (rather than reverse=True) keeps the original ordering of ties
			chromosome_fitness.sort(key=lambda x: x[1])
			chromosome_fitness.reverse()

			fittest_chromosome, fittest_score = chromosome_fitness[0]

			#Checks if max_cycles exceeded. If so, returns the fittest chromosome
			if cycles >= self.max_cycles:
				print("Best Key: " + str(fittest_chromosome))
				print("Decrypted text:  " + fittest_chromosome.convertText(cipher_text))
				print("")
				return (fittest_chromosome, cycles)

			#Checks if fitness is stagnant
			if fittest_score <= best_fitness:
				shock_ticker += 1
			else:
				best_fitness = max(fittest_score, best_fitness)
				shock_ticker = 0

			#If __VERBOSE__, provide report on most fit chromosome
			if __VERBOSE__:
				converted_text = fittest_chromosome.convertText(cipher_text)
				converted_words = converted_text.split(" ")
				print("Cycle# " + str(cycles))
				print("Best Chromosome: " + str(fittest_chromosome))
				print("Fitness: " + str(fittest_score))
				print("Shock Ticker: " + str(shock_ticker))
				print("Cycle Time: " + str(time.time()-start_time))
				print("Attempted Decrypt: " + converted_text)
				print("Known words: " + str(self.spellchecker.known(converted_words)))
				print("Unknown words: " + str(self.spellchecker.unknown(converted_words)))
				print("")

			start_time = time.time()

			#Creates a new chromosomes list
			new_chromosomes = []

			#Copies over elite to new chromosomes
			for chromosome_iter in range(self.elites):
				new_chromosomes.append(chromosome_fitness[chromosome_iter][0].clone())

			#Creates children in new_chromosomes

			#Performs tournament process to select breeding candidates
			tournament_selections = []
			while len(tournament_selections) < self.children:
				tournament_selections.append(self.tournament(chromosome_fitness))

			#Breeds selected candidates
			while len(tournament_selections) > 0:
				chance = random.random()
				if chance < self.cross_chance and len(tournament_selections) > 1:
					chromosome_one = tournament_selections.pop()
					chromosome_two = tournament_selections.pop()

					crossed_chromosomes = self.crossChromosomes(chromosome_one, chromosome_two)

					new_chromosomes.append(crossed_chromosomes[0])
					new_chromosomes.append(crossed_chromosomes[1])
				elif chance < (self.mutation_chance + self.cross_chance):
					new_chromosomes.append(self.mutateChromosome(tournament_selections.pop()))
				else:
					new_chromosomes.append(self.getRandomChromosome())

			#Adds random chromosomes to new_chromosomes
			for random_iter in range(self.randoms):
				new_chromosomes.append(self.getRandomChromosome())

			#Checks if genetic shock should be triggered
			if shock_ticker >= self.shock_threshold and self.shock_enabled:
				if __VERBOSE__:
					print("Triggering genetic shock...\n")

				#Performs genetic shock: every chromosome whose fitness exceeds 90% of
				#best_fitness is replaced by a random one, all others are mutated
				for chromosome_iter in range(len(new_chromosomes)):
					if self.getFitness(new_chromosomes[chromosome_iter], cipher_text) > .9 * best_fitness:
						new_chromosomes[chromosome_iter] = self.getRandomChromosome()
					else:
						new_chromosomes[chromosome_iter] = self.mutateChromosome(new_chromosomes[chromosome_iter])

				#Resets shock tickers and trackers
				shock_ticker = 0
				best_fitness = 0

			#Shifts new_chromosomes into gene pool
			chromosomes = new_chromosomes

	def mutateChromosome(self, chromosome):
		"""Return a clone of chromosome in which two mappings have swapped their targets."""
		new_chromosome = chromosome.clone()

		#Chooses two distinct mappings to swap
		mutation_one_index = random.randint(0,25)
		mutation_two_index = random.randint(0,25)

		while mutation_two_index == mutation_one_index:
			mutation_two_index = random.randint(0,25)

		mutation_one = new_chromosome.mappings[mutation_one_index]
		mutation_two = new_chromosome.mappings[mutation_two_index]

		new_chromosome.removeMapping(mutation_one)
		new_chromosome.removeMapping(mutation_two)

		#Each origin keeps its place but receives the other mapping's target
		mapping_one = (mutation_one[0], mutation_two[1])
		mapping_two = (mutation_two[0], mutation_one[1])

		new_chromosome.addMapping(mapping_one)
		new_chromosome.addMapping(mapping_two)

		return new_chromosome

	def crossChromosomes(self, chromosome_one, chromosome_two):
		"""Cross two chromosomes.

		Returns:
			Two crosses of the passed chromosomes in the format
			(new_chromosome_one, new_chromosome_two). The inputs are not modified.
		"""
		new_chromosome_one = chromosome_one.clone()
		new_chromosome_two = chromosome_two.clone()

		for chromosome_iter in range(26):
			#Each position is exchanged with a 50% chance
			if random.random() > .5:
				old_mapping_one = new_chromosome_one.mappings[chromosome_iter]
				old_mapping_two = new_chromosome_two.mappings[chromosome_iter]

				if old_mapping_one != old_mapping_two:
					#The mapping that already points at the incoming target must be
					#re-targeted so that no target is used twice in a chromosome
					complement_mapping_one = new_chromosome_one.getMappingTarget(old_mapping_two[1])
					complement_mapping_two = new_chromosome_two.getMappingTarget(old_mapping_one[1])

					old_origin_one = complement_mapping_one[0]
					old_origin_two = complement_mapping_two[0]

					new_chromosome_one.removeMapping(complement_mapping_one)
					new_chromosome_two.removeMapping(complement_mapping_two)

					new_chromosome_one.removeMapping(old_mapping_one)
					new_chromosome_two.removeMapping(old_mapping_two)

					complement_mapping_one = (old_origin_two, complement_mapping_one[1])
					complement_mapping_two = (old_origin_one, complement_mapping_two[1])

					new_chromosome_one.addMapping(old_mapping_two)
					new_chromosome_one.addMapping(complement_mapping_two)
					new_chromosome_two.addMapping(old_mapping_one)
					new_chromosome_two.addMapping(complement_mapping_one)

		return (new_chromosome_one, new_chromosome_two)

	def getRandomChromosome(self):
		"""Return a new chromosome mapping each letter a-z to a randomly shuffled letter a-z."""
		new_chromosome = Chromosome()

		origin = [chr(letter_iter + 97) for letter_iter in range(26)]
		destination = list(origin)

		random.shuffle(destination)

		for mapping in zip(origin, destination):
			new_chromosome.addMapping(mapping)

		return new_chromosome

	def tournament(self, chromosome_fitness):
		"""Pick tournament_size random entries of chromosome_fitness and return a clone of the fittest.

		chromosome_fitness is a list of (chromosome, fitness) tuples with at
		least self.population entries.
		"""
		tournament_pool = []

		for tournament_iter in range(self.tournament_size):
			tournament_pool.append(chromosome_fitness[random.randint(0, self.population-1)])

		return (max(tournament_pool, key=lambda x: x[1]))[0].clone()

	def getFitness(self, chromosome, cipher_text):
		"""Evaluate the fitness of chromosome for cipher_text.

		For every block of the converted text that also occurs in
		sample_block_table, adds log2(sample frequency) times the number of
		occurrences of that block in the converted text.
		"""
		total_fitness = 0
		parsed_block_table = self.getBlockTable(chromosome.convertText(cipher_text))

		for block, occurrences in parsed_block_table.items():
			if block in self.sample_block_table:
				total_fitness += math.log(self.sample_block_table[block],2)*occurrences

		return total_fitness

	#
	# BLOCK FUNCTIONS
	#

	def getSampleBlockTable(self, sample_path):
		"""Read the samples file at sample_path into a {block: frequency} dictionary.

		Every non-blank line is expected to look like "<block> <count>".
		"""
		block_table = {}

		#The with-statement closes the file even if a line fails to parse
		with open(sample_path) as input_file:
			for line in input_file:
				#Blank lines (e.g. a trailing one) carry no data
				if not line.strip():
					continue

				components = line.split(" ")

				#int() ignores the trailing newline itself, so the last line is parsed
				#correctly even when the file does not end with a newline
				block_table[components[0]] = int(components[1])

		return block_table

	def getBlockTable(self, input_string):
		"""Return a dictionary counting every block of every space-separated word of input_string."""
		block_table = {}

		#Hashes blocks in dictionary to count them
		for word in input_string.split(" "):
			for block in self.getBlocks(word):
				block_table[block] = block_table.get(block, 0) + 1

		return block_table

	def getBlocks(self, input_string):
		"""Return all substrings of input_string, ordered by length and then by start position."""
		string_len = len(input_string)

		return [
			input_string[start_point:start_point + block_len]
			for block_len in range(1, string_len + 1)
			for start_point in range(string_len - block_len + 1)
		]
Esempio n. 32
0
#!/usr/bin/env python

'''
pip install pyspellchecker
'''

from spellchecker import SpellChecker
# Spell checker created with the library's default settings.
spell = SpellChecker()

# Collect the sample words that the checker does not recognise.
sample_words = ['let', 'us', 'wlak', 'on', 'the', 'groun']
misspelled = spell.unknown(sample_words)

for word in misspelled:
    # First line: the single `most likely` answer.
    # Second line: the collection of `likely` options.
    print(spell.correction(word), spell.candidates(word), sep="\n")