import string

import spacy
from spacy.tokenizer import Tokenizer

# EditDistanceFinder and LanguageModel are assumed to come from the
# assignment's own modules; their import lines were not part of the
# original snippets.


class SpellChecker():
    def __init__(self, max_distance, channel_model=None, language_model=None):
        self.nlp = spacy.load('en', pipeline=["tagger", "parser"])
        self.max_distance = max_distance
        self.channel_model = channel_model
        self.language_model = language_model

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        prevFocusScore = self.language_model.bigram_prob(prev_word, focus_word)
        focusNextScore = self.language_model.bigram_prob(focus_word, next_word)
        return (prevFocusScore + focusNextScore) / 2

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        '''
            Take in a word and return a list of words that are within one insert of it
        '''
        # Try inserting every letter
        possibleWords = []
        for letter in string.ascii_lowercase:
            # At every possible position
            for i in range(len(word) + 1):
                # Check if the resulting word is a word
                testWord = word[:i] + letter + word[i:]
                if testWord in self.language_model:
                    possibleWords.append(testWord)
        return possibleWords

    def deletes(self, word):
        # Try deleting the letter at every position
        possibleWords = []
        for i in range(len(word)):
            # Check if the resulting word is a word
            testWord = word[:i] + word[i + 1:]
            if testWord in self.language_model:
                possibleWords.append(testWord)
        return possibleWords

    def substitutions(self, word):
        # Try substituting every letter
        possibleWords = []
        for letter in string.ascii_lowercase:
            # At every position in the word
            for i in range(len(word)):
                # Check if the resulting word is a word
                testWord = word[:i] + letter + word[i + 1:]
                if testWord in self.language_model:
                    possibleWords.append(testWord)
        return possibleWords

    def generate_candidates(self, word):
        '''
            Take a word and return all words within self.max_distance edits of it
        '''
        for i in range(1, self.max_distance + 1):
            if i == 1:
                candidateWords = self.inserts(word) + self.deletes(
                    word) + self.substitutions(word)
            else:
                newWords = []
                for currentWord in candidateWords:
                    newWords += self.inserts(currentWord) + self.deletes(
                        currentWord) + self.substitutions(currentWord)
                candidateWords += newWords
        # Get rid of duplicates
        return list(set(candidateWords))
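
    # With max_distance=2 the loop above is a breadth-first expansion: every
    # distance-1 candidate is edited once more, so a misspelling like "wrod"
    # can reach "word" through a chain such as "wrod" -> "wood" -> "word"
    # (assuming both forms are in the language model, since only in-vocabulary
    # intermediates are kept). The final set() pass removes the many
    # duplicates generated along the way.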

    def check_sentence(self, sentence, fallback=False):
        returnList = []
        for i in range(len(sentence)):
            if i == 0 and i == len(sentence) - 1:
                prevWord = '<s>'
                nextWord = '</s>'
            elif i == 0:
                prevWord = '<s>'
                nextWord = sentence[i + 1]
            elif i == len(sentence) - 1:
                nextWord = '</s>'
                prevWord = sentence[i - 1]
            else:
                prevWord = sentence[i - 1]
                nextWord = sentence[i + 1]
            word = sentence[i]
            # If it's in the language model, add just that word
            if word in self.language_model:
                returnList.append([word])
            else:
                # Get all the candidates for that word
                candidates = self.generate_candidates(word)
                candidateList = []
                if candidates == [] and fallback:
                    candidateList = [word]
                else:
                    for candidate in candidates:
                        unigramScore = self.unigram_score(candidate)
                        bigramScore = self.bigram_score(
                            prevWord, candidate, nextWord)
                        languageScore = (0.5*unigramScore) + \
                            (0.5 * bigramScore)
                        candidateScore = languageScore + \
                            self.cm_score(word, candidate)

                        candidateList.append([candidate, candidateScore])

                    # Sort the list by the second element
                    candidateList.sort(key=lambda x: x[1], reverse=True)
                    # Remove the second element, and append
                    candidateList = [x[0] for x in candidateList]
                returnList += [candidateList]

        return returnList

    def check_text(self, text, fallback=False):
        '''
        Take a string as input, tokenize and sentence-segment it with spacy, and
        return a list containing the result of calling check_sentence on each of
        the resulting sentences.
        '''
        tokens = self.nlp(text)
        sentences = list(tokens.sents)

        processedSentences = []
        for sentence in sentences:
            # Convert sentence into list of lowercase words
            wordList = sentence.text.split()
            wordList = [x.lower() for x in wordList]
            processedSentences.append(self.check_sentence(wordList, fallback))

        return processedSentences

    def autocorrect_sentence(self, sentence):
        '''
        Take a tokenized sentence (as a list of words) as input, call
        check_sentence on it with fallback=True, and return a new list of
        tokens where each non-word has been replaced by its most likely
        spelling correction.
        '''
        corrections = self.check_sentence(sentence, fallback=True)
        return [x[0] for x in corrections]

    def autocorrect_line(self, line):
        '''
        Take a string as input, tokenize and segment it with spacy, and return
        the concatenation of the result of calling autocorrect_sentence on all
        of the resulting sentence objects.
        '''

        tokens = self.nlp(line)
        sentences = list(tokens.sents)

        processedSentences = []
        for sentence in sentences:
            # Convert sentence into list of lowercase words
            wordList = sentence.text.split()
            if len(wordList) == 0:
                continue
            wordList = [x.lower() for x in wordList]
            processedSentences.append(self.autocorrect_sentence(wordList))

        return processedSentences

    def suggest_sentence(self, sentence, max_suggestions):
        '''
            take a tokenized sentence (as a list of words) as input, call check_sentence on the sentence, and return a new list where:
            Real words are just strings in the list
            Non-words are lists of up to max_suggestions suggested spellings, ordered by your model’s preference for them.
        '''
        sentenceCorrections = self.check_sentence(sentence)

        returnList = []
        for word in sentenceCorrections:
            if len(word) == 1:
                returnList += word
            else:
                returnList.append(word[:max_suggestions])

        return returnList

    def suggest_text(self, text, max_suggestions):
        '''
            take a string as input, tokenize and segment it with spacy, and then return the concatenation of the result of calling suggest_sentence on all of the resulting sentence objects
        '''
        tokens = self.nlp(text)
        sentences = list(tokens.sents)

        processedSentences = []
        for sentence in sentences:
            # Convert sentence into list of lowercase words
            wordList = sentence.text.split()
            if not wordList:
                continue
            wordList = [x.lower() for x in wordList]
            # Get rid of the trailing period
            if wordList[-1].endswith('.'):
                wordList[-1] = wordList[-1][:-1]
            processedSentences.append(
                self.suggest_sentence(wordList, max_suggestions))

        return processedSentences
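
# A minimal usage sketch for the implementation above; the model file names
# below are placeholders, and load() is assumed to take an open file object
# (that is how load_channel_model and load_language_model use it).
if __name__ == "__main__":
    checker = SpellChecker(max_distance=2)
    with open("channel_model.txt") as fp:       # placeholder path
        checker.load_channel_model(fp)
    with open("language_model.txt") as fp:      # placeholder path
        checker.load_language_model(fp)
    print(checker.check_sentence(["thsi", "is", "fine"]))
    print(checker.autocorrect_line("thsi is fine."))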
class SpellChecker():
    def __init__(self,
                 channel_model=None,
                 language_model=None,
                 max_distance=100):
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        """
        takes 3 words and returns the average bigram probability of the first and last pair
        """
        return (self.language_model.bigram_prob(prev_word, focus_word) +
                self.language_model.bigram_prob(focus_word, next_word)) / 2

    def unigram_score(self, word):
        """
        takes a word and returns the unigram probability
        """
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        """
        gives the probability of a word having been transformed into a given erroneous form

        params
        ------
        error_word     - the observed misspelling
        corrected_word - the proposed corrected word

        returns
        -------
        prob           - the probability of the corrected word having been transformed into the error word
        """
        return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        wordsFound = []
        wordLen = len(word)
        for v in self.language_model.vocabulary:
            if v.isalpha() and len(v) == (wordLen + 1):
                if self.subseq(word, v):
                    wordsFound.append(v)
        return wordsFound

    def subseq(self, word1, word2):
        """
        returns true if word1 is a subsequence of word2  
        """
        for i in range(len(word1)):
            if (word1[i] not in word2):
                return False
            else:
                index = word2.index(word1[i])
                word2 = word2[index + 1:]
        return True
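
    # For example, subseq("ac", "abc") matches 'a' at index 0 and then finds
    # 'c' in the remainder "bc", so it returns True; subseq("ca", "abc")
    # consumes word2 through 'c', leaving "", so 'a' can no longer be matched
    # and it returns False.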

    def deletes(self, word):
        wordsFound = []
        wordLen = len(word)
        for v in self.language_model.vocabulary:
            if v.isalpha() and len(v) == (wordLen - 1):
                if self.subseq(v, word):
                    wordsFound.append(v)
        return wordsFound

    def substitutions(self, word):
        """
        take a word as input and return a list of words (that are in the LanguageModel) that are 
        within one substitution of word.
        """
        subList = []
        wordLen = len(word)

        for candidate in self.language_model.vocabulary:
            if candidate.isalpha() and len(candidate) == wordLen:
                for i in range(wordLen):
                    candidateDel = candidate[:i] + candidate[i + 1:]
                    wordDel = word[:i] + word[i + 1:]
                    if candidateDel == wordDel:
                        if candidate not in subList:
                            subList.append(candidate)
                        break

        return subList

    def transpositions(self, word):
        """
        take a word as input and return a list of words in LanguageModel that
        are within one transposition of the word.
        """
        wordsFound = []
        wordLen = len(word)
        for i in range(wordLen - 1):
            transp = word[0:i] + word[i + 1] + word[i] + word[i + 2:]
            if transp in self.language_model:
                wordsFound.append(transp)
        return wordsFound
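
    # For example, with word "hte", i=0 swaps 'h' and 't' giving "the" and
    # i=1 swaps 't' and 'e' giving "het"; only the swaps that appear in the
    # language model are kept.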

    def generate_candidates(self, word):
        """
        returns a list of words within max_distance edits of the given word
        """
        words = {word}
        for i in range(self.max_distance):
            # find all words within edit distance 1 of the words currently in words
            new_words = set()
            for candidate in words:
                new_words |= set(self.inserts(candidate)) | set(
                    self.deletes(candidate)) | set(
                        self.substitutions(candidate)) | set(
                            self.transpositions(candidate))
            words |= new_words
        # word seeded the first candidate set, but we don't want it in the
        # final return if it isn't actually a word
        if word not in self.language_model:
            words.remove(word)
        return list(words)

    def check_non_words(self, sentence, fallback=False):
        words = []
        for i in range(len(sentence)):
            if sentence[i] in self.language_model:
                words.append([sentence[i]])
            else:
                candidates = self.generate_candidates(sentence[i])
                prev_word = '<s>' if i == 0 else sentence[i - 1]
                next_word = '</s>' if i == len(sentence) - 1 else sentence[i + 1]
                candidates.sort(
                    key=lambda x: 0.7 * (0.7 * self.bigram_score(prev_word, x, next_word)
                                         + 0.3 * self.unigram_score(x))
                                  + 0.3 * self.cm_score(sentence[i], x),
                    reverse=True)
                if fallback and not candidates:
                    candidates = [sentence[i]]
                words.append(candidates)
        return words

    def check_sentence(self, sentence, fallback=False):
        sentList = [
            ''.join([
                char for char in token.text if char in string.ascii_lowercase
            ]) for token in sentence
        ]
        sentList = [token for token in sentList if token]
        return self.check_non_words(sentList, fallback)

    def check_text(self, text, fallback=False):
        """
        takes a string as input, tokenize and sentence segment it with spacy, 
        and then return the concatenation of the result of calling check_sentence 
        on all of the resulting sentence objects.
        """
        self.nlp.tokenizer = Tokenizer(self.nlp.vocab)
        doc = self.nlp(text.lower())
        result = []
        for sent in doc.sents:
            correctionList = self.check_sentence(sent, fallback)
            result.extend(correctionList)
        return result

    def autocorrect_sentence(self, sentence):
        """Take a tokenized sentence (as a list of words) as input, 
        call check_sentence on the sentence with fallback=True, and 
        return a new list of tokens where each non-word has been
         replaced by its most likely spelling correction
        """
        words = self.check_sentence(sentence, True)
        newSentence = []
        for i in range(len(sentence)):
            newSentence.append(words[i][0])
        return newSentence

    def autocorrect_line(self, line):
        """Take a string as input, tokenize and segment it with spacy, 
        and then return the concatenation of the result
        of calling autocorrect_sentence on all of the resulting sentence objects.
        """
        checkLines = self.check_text(line, True)
        newSentence = []
        for i in range(len(checkLines)):
            newSentence.append(checkLines[i][0])
        return ' '.join(newSentence)

    def suggest_sentence(self, sentence, max_suggestions):

        words = self.check_sentence(sentence, True)
        newSentence = []
        for i in range(len(sentence)):
            if sentence[i].text in self.language_model:
                newSentence.append(sentence[i].text)
            else:
                newSentence.append(words[i][0:max_suggestions])
        return newSentence

    def suggest_text(self, text, max_suggestions):
        self.nlp.tokenizer = Tokenizer(self.nlp.vocab)
        doc = self.nlp(text.lower())
        result = []
        for sent in doc.sents:
            correctionList = self.suggest_sentence(sent, max_suggestions)
            result.extend(correctionList)
        return result
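
# A hypothetical driver for this second version. Note that check_sentence
# drops punctuation-only tokens while suggest_sentence indexes by original
# token position, so the sample text below is deliberately punctuation-free:
if __name__ == "__main__":
    checker = SpellChecker()
    checker.load_channel_model(open("channel_model.txt"))    # placeholder path
    checker.load_language_model(open("language_model.txt"))  # placeholder path
    # real words come back as strings, non-words as ranked suggestion lists
    print(checker.suggest_text("i like looking at teh sky", max_suggestions=3))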
class SpellChecker(object):

    def __init__(self, max_distance, channel_model=None, language_model=None,
                 transpose=True):
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance
        # whether _one_step also considers transpositions
        self.transpose = transpose
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        score = lambda x, y: self.language_model.bigram_prob(x,y)
        return (score(prev_word, focus_word) + score(focus_word, next_word))/(2.0)

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def inserts(self, word):
        l = []
        for i in range(len(word) + 1):
            for char in string.ascii_lowercase:
                l.append(word[:i] + char + word[i:])
        return [x for x in l if x in self.language_model]

    def deletes(self, word):
        l = []
        for i in range(len(word)):
            l.append(word[:i] + word[i+1:])
        return [x for x in l if x in self.language_model]

    def substitutions(self, word):
        l = []
        for i in range(len(word)):
            for char in string.ascii_lowercase:
                l.append(word[:i] + char + word[i+1:])
        return [x for x in l if x in self.language_model]

    def transposes(self, word):
        l = []
        for i in range(1,len(word)):
            l.append(word[:i-1] + word[i] + word[i-1] + word[i+1:])
        return [x for x in l if x in self.language_model]

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.align(error_word, corrected_word)[0]

    def generate_candidates(self, word):
        candidates = set()
        source = [word]
        for i in range(self.max_distance):
            nested = list(map(self._one_step, source))
            flat = [l for sublist in nested for l in sublist]
            source = list(set(flat))
            # accumulate candidates from every distance, not just the last
            candidates.update(source)
        return list(candidates)

    def check_sentence(self, sentence, fallback=False):
        l = []
        for i in range(len(sentence)):
            word = sentence[i]
            if (word in self.language_model) or (word in string.punctuation) or word == '\n':
                l.append([word])
            else:
                choices = self.generate_candidates(word)
                if len(choices) == 0:
                    # keep the output aligned with the input even when there
                    # is no fallback
                    l.append([word] if fallback else [])
                else:
                    if i<1:
                        prev_word = '<s>'
                    else:
                        prev_word = sentence[i-1]

                    if i+1 == len(sentence):
                        next_word = '</s>'
                    else:
                        next_word = sentence[i+1]

                    #rank = lambda x: self.cm_score(x, word)
                    #rank = lambda x: self.bigram_score(prev_word, x, next_word)
                    rank = lambda x: self._combine_scores(self.cm_score(x, word), self.bigram_score(prev_word, x, next_word), self.unigram_score(x))
                    ranked = sorted(choices, key = rank, reverse=False)
                    l.append(list(ranked))


        return l

    def _combine_scores(self, cm_score, bigram_score,unigram_score):
        return cm_score - 0.5*(bigram_score+unigram_score)
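
    # Sign convention: cm_score here is an alignment cost from align() (lower
    # is better), while bigram_score/unigram_score are log probabilities
    # (higher is better). Subtracting the language scores and sorting the
    # candidates in ascending order therefore prefers low-cost,
    # high-probability corrections.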



    def _one_step(self, word):
        if self.transpose:
            return (self.inserts(word) + self.deletes(word) +
                    self.substitutions(word) + self.transposes(word))
        else:
            return self.inserts(word) + self.deletes(word) + self.substitutions(word)

    def autocorrect_sentence(self, sentence):
        options = self.check_sentence(sentence, fallback=True)
        return [x[0] for x in options]

    def suggest_sentence(self, sentence, max_suggestions):
        options = self.check_sentence(sentence)
        get = lambda x: x[0] if len(x) == 1 else x[:max_suggestions]
        return [get(x) for x in options]

    def check_text(self, text, fallback=False):
        func = lambda x: self.check_sentence(x, fallback)
        return self._spacy_map(text, func)

    def autocorrect_line(self, line):
        return self._spacy_map(line, self.autocorrect_sentence)

    def suggest_text(self, text, max_suggestions):
        func = lambda x: self.suggest_sentence(x, max_suggestions)
        return self._spacy_map(text, func)

    def _spacy_map(self, text, function):
        doc = self.nlp(text.lower())
        l = []
        for sentence in doc.sents:
            stringlist = [str(x) for x in sentence]
            l += function(stringlist)
        return l
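
# Because every public method of this version funnels through _spacy_map,
# check_text, autocorrect_line, and suggest_text each return one flat list
# covering all sentences in the input. A hypothetical call (models assumed
# loaded as in the earlier sketches):
#
#     checker.autocorrect_line("thsi is fine. so is thsi.")
#     # -> a single flat list of corrected tokens for both sentences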
class SpellChecker():
    def __init__(self, max_distance=1, channel_model=None, language_model=None):
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        if self.language_model:
            return (self.language_model.bigram_prob(prev_word, focus_word) +\
                    self.language_model.bigram_prob(focus_word, next_word))/2

    def unigram_score(self, word):
        if self.language_model:
            return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        if self.channel_model:
            return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        words = []
        for letter in string.ascii_lowercase:
            for i in range(len(word)+1):
                pos = word[:i] + letter + word[i:]
                if pos in self.language_model.vocabulary:
                    words.append(pos)
        return words

    def deletes(self, word):
        words = []
        for i in range(len(word)):
            pos = word[:i] + word[i+1:]
            if pos in self.language_model.vocabulary:
                words.append(pos)
        return words

    def substitutions(self, word):
        words = []
        for letter in string.ascii_lowercase:
            for i in range(len(word)):
                pos = word[:i] + letter +  word[i+1:]
                if pos in self.language_model.vocabulary:
                    words.append(pos)
        return words

    def transpositions(self, word):
        words = []
        for i in range(1, len(word)):
            pos = word[:i-1] + word[i] + word[i-1] + word[i+1:]
            if pos in self.language_model.vocabulary:
                words.append(pos)
        return words

    def generate_candidates(self, word):
        candidates = set()
        candidates.update(self.inserts(word))
        candidates.update(self.deletes(word))
        candidates.update(self.substitutions(word))
        d = self.max_distance - 1
        while d > 0:
            for word in candidates.copy():
                candidates.update(self.inserts(word))
                candidates.update(self.deletes(word))
                candidates.update(self.substitutions(word))
            d -= 1
        return list(candidates)

    def generate_candidates_optimized(self, word):
        # the original referenced self.optimized_finder, which is not defined
        # anywhere in this class; fall back to the standard generator
        return self.generate_candidates(word)

    def score(self, prev_word, focus_word, next_word, observed_word):
        lang_score = 0.2*self.unigram_score(focus_word) + 0.8*self.bigram_score(prev_word, focus_word, next_word)
        return 0.7*lang_score + 0.3*self.cm_score(observed_word, focus_word)

    def check_sentence(self, sentence, fallback=False):
        suggestion = []
        for i in range(len(sentence)):
            observed_word = sentence[i]
            if observed_word.lower() in self.language_model or (len(observed_word) == 1 and observed_word not in string.ascii_lowercase):
                suggestion.append([observed_word])
                continue
            prev_word = None
            next_word = None
            if i == 0:
                prev_word = '<s>'
            else:
                prev_word = sentence[i-1]
            if i == len(sentence) - 1:
                next_word = '</s>'
            else:
                next_word = sentence[i+1]

            suggested = self.generate_candidates(observed_word)
            if fallback and len(suggested) == 0:
                suggested.append(observed_word)
            suggestion.append(
                sorted(
                    suggested,
                    key=lambda e: self.score(prev_word, e, next_word,
                        observed_word),
                    reverse=True
                )
            )
        return suggestion

    def get_tokens(self, sentence):
        return [x.text for x in sentence]

    def check_text(self, text, fallback=False):
        doc = self.nlp(text)
        result = []
        for sentence in doc.sents:
            tokens = self.get_tokens(sentence)
            result.append(self.check_sentence(tokens, fallback))
        return result

    def autocorrect_sentence(self, sentence):
        temp = self.check_sentence(sentence, True)
        result = []
        for token in temp:
            result.append(token[0])
        return result

    def autocorrect_line(self, line):
        doc = self.nlp(line)
        result = []
        for sentence in doc.sents:
            tokens = self.get_tokens(sentence)
            result.append(self.autocorrect_sentence(tokens))
        return '\n'.join([' '.join(sentence) for sentence in result])

    def suggest_sentence(self, sentence, max_suggestions):
        temp = self.check_sentence(sentence, True)
        result = []
        for token in temp:
            result.append(token[:max_suggestions])
        return result

    def suggest_text(self, text, max_suggestions):
        doc = self.nlp(text)
        result = []
        for sentence in doc.sents:
            tokens = self.get_tokens(sentence)
            result.append(self.suggest_sentence(tokens, max_suggestions))
        return result
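
# Unlike the earlier versions, this implementation nests its results per
# sentence (check_text and suggest_text use result.append, not extend), so a
# hypothetical two-sentence input yields a list of two sentence-level lists:
#
#     out = checker.check_text("thsi is fine. so is thsi.")
#     # out[0] -> suggestions for sentence 1, out[1] -> for sentence 2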
class SpellChecker():

	def __init__(self, channel_model=None, language_model=None, max_distance=1):
		self.channel_model = channel_model
		self.language_model = language_model
		self.max_distance = max_distance

	def load_channel_model(self, fp):
		self.channel_model = EditDistanceFinder()
		self.channel_model.train(fp)

	def load_language_model(self, fp):
		self.language_model = LanguageModel()
		self.language_model.load(fp)

	def bigram_score(self, prev_word, focus_word, next_word):
		return (self.language_model.bigram_score(prev_word,focus_word) + self.language_model.bigram_score(focus_word,next_word))/2

	def unigram_score(self, word):
		return self.language_model.unigram_score(word)

	def cm_score(self, error_word, corrected_word):
		return self.channel_model.align(error_word,corrected_word)[0]

	@staticmethod
	def isSubstring(w1, w2):
		for letter in w1:
			try:
				w2 = w2[w2.index(letter):]
			except:
				return False
		return True

	def inserts(self, word):
		output = []
		for w in self.language_model:
			if len(w) == len(word) + 1:
				if self.isSubstring(word, w):
					output.append(w)
		return output

	def deletes(self, word):
		output = []
		for w in self.language_model:
			if len(w) == len(word) - 1:
				if self.isSubstring(w, word):
					output.append(w)
		return output

	def substitutions(self, word):
		output = []
		for w in self.language_model:
			if len(w) == len(word):
				numInc = 0
				for i in range(len(w)):
					if w[i] != word[i]:
						numInc += 1
				if numInc == 1:
					output.append(w)
		return output

	def generate_candidates(self, word):
		output = [word]
		candidates = []
		for _ in range(self.max_distance):
			newOutput = []
			for w in output:
				newOutput += self.inserts(w) + self.deletes(w) + self.substitutions(w)
			output = newOutput
			# accumulate candidates from every edit distance
			candidates += newOutput

		return list(set(candidates))

	def check_non_words(self, sentence, fallback=False):
		output = []
		for word in sentence:
			if word in self.language_model:
				output.append([word])
			else:
				L = self.generate_candidates(word)
				if fallback and len(L) == 0:
					output.append([word])
				else:
					L.sort(key=lambda w: self.language_model.unigram_score(w) + self.channel_model.align(word, w)[0])
					output.append(L)
		return output

	def check_sentence(self, sentence, fallback=False):
		return self.check_non_words(sentence, fallback)

class SpellChecker:
    def __init__(self, max_distance, channel_model=None, language_model=None):
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance
        self.punc = '.?:;"\'!\n,/\\'

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        # returns the average log prob between bigrams (prev, focus)
        # and (focus, next).
        return 0.5 * (self.language_model.bigram_prob(prev_word, focus_word) +
                      self.language_model.bigram_prob(focus_word, next_word))

    def unigram_score(self, word):
        # returns the log probability of this unigram
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        # returns the log probability that error_word was typed when
        # corrected_word was intended.
        return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        # returns a list of words that are within one insert of word
        # we try inserting each character in each position of the word
        l = []
        for i in range(len(word) + 1):
            for new_char in string.ascii_lowercase:
                new_word = word[0:i] + new_char + word[i:]
                if new_word in self.language_model:
                    l.append(new_word)
        return list(set(l))

    def deletes(self, word):
        # returns a list of words that are within one delete of word
        # we try deleting each character in the word
        l = []
        for i in range(len(word)):
            new_word = word[0:i] + word[i + 1:]
            if new_word in self.language_model:
                l.append(new_word)
        return list(set(l))

    def substitutions(self, word):
        # returns a list of words that are within one substitution of word
        # we try substituting each character in the word with every other char
        l = []
        for i, char in enumerate(word):
            for new_char in string.ascii_lowercase:
                new_word = word[0:i] + new_char + word[i + 1:]
                if char != new_char and new_word in self.language_model:
                    l.append(new_word)
        return l

    def transpositions(self, word):
        # returns a list of words that are within one transposition of word
        # we try transposing each pair of adjacent characters
        l = []
        for i in range(len(word) - 1):
            new_word = word[0:i] + word[i + 1] + word[i] + word[i + 2:]
            if new_word in self.language_model:
                l.append(new_word)
        return list(set(l))

    def generate_candidates(self, word):
        # returns a list of words that are within max_distance
        # edits from the input word. We do this by first generating
        # the words that are 1 edit away, then the words that are 1
        # edit away from those, and so on.

        checked_word_list = []  # tracks words we've already checked
        words_to_check = [word]  # tracks words we need to check
        word_list = []  # tracks our final word list

        for _ in range(self.max_distance):

            new_words_list = []  # new words on this iteration

            for w in words_to_check:
                if w in checked_word_list:
                    continue  # we already checked this word
                checked_word_list.append(w)

                # try deletion/insertion/substitution to find new words
                new_words_list.extend(self.inserts(w))
                new_words_list.extend(self.deletes(w))
                new_words_list.extend(self.substitutions(w))
                new_words_list.extend(self.transpositions(w))

            # add new unique words to our word list
            words_to_check = []
            for w in new_words_list:
                if w not in word_list:
                    word_list.append(w)
                    words_to_check.append(w)

        return word_list

    def sort_candidates(self, error_word, prev_word, next_word, candidates):
        """ takes as input a spelling error and a list of candidates
            and returns a sorted list of candidates, where earlier candidates
            are "better" suggestions, in terms of a weighted combination of
            unigram score, bigram score, and edit distance score.
            Note, our choice depends somewhat on the context of the word
        """
        score_list = []
        for candidate in candidates:
            bigram_score = self.bigram_score(prev_word, candidate, next_word)
            unigram_score = self.unigram_score(candidate)
            edit_score = self.cm_score(error_word, candidate)
            # we use an equally weighted linear combination of log edit score
            # and language model score.
            score = 0.5 * edit_score + 0.25 * (bigram_score + unigram_score)
            score_list.append((candidate, score))

        # sort list so that highest score comes first
        sorted_list = sorted(score_list, key=lambda x: -x[1])
        return [w for w, s in sorted_list]
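
        # e.g. with log scores edit=-2.0, bigram=-6.0, unigram=-8.0 the
        # combined score is 0.5*(-2.0) + 0.25*(-6.0 + -8.0) = -4.5, so a
        # strong channel-model match can outweigh a modest language-model
        # advantage.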

    def check_non_words(self, sentence, fallback=False):
        """ Takes as input a list of words, and returns a list of lists
            of words. If the word is in the language model, the list
            contains just the original word. Otherwise, it contains a list
            of spell correcting suggestions. If fallback is true, we will 
            replace any word with no suggestions with the list of just
            the word itself.
        """
        l = []

        for i, word in enumerate(sentence):
            word = word.lower()  # enforce lowercase
            if word in self.language_model or word in self.punc:
                l.append([word])  # correctly spelled word/punctuation
            else:
                candidates = self.generate_candidates(word)
                prevW = sentence[i - 1] if i > 0 else "<s>"
                nextW = sentence[i + 1] if i + 1 < len(sentence) else "</s>"
                candidates = self.sort_candidates(word, prevW, nextW,
                                                  candidates)

                if candidates or not fallback:
                    l.append(candidates)  # give candidate suggestions
                else:
                    l.append([word])  # fallback case, no candidates

        return l

    def check_sentence(self, sentence, fallback=False):
        """ Takes as input a list of words, and returns a list of 
            lists of words. Correctly spelled words appear in their own
            list; otherwise, a list of spelling corrections is given in
            order of likelihood.
        """
        return self.check_non_words(sentence, fallback=fallback)

    def check_line(self, line, fallback=False):
        """ Takes as input a string, tokenizes it, and returns a list of 
            lists of words. Correctly spelled words appear in their own
            list; otherwise, a list of spelling corrections is given in
            order of likelihood.
        """
        doc = self.nlp(line)  # use spacy to segment sentences
        l = []

        for sent in doc.sents:
            # generate the sentence as a list of strings; punctuation tokens
            # are handled downstream by check_non_words
            sentence = [str(w) for w in sent]
            # pass our sentence to the check_sentence method
            l.extend(self.check_sentence(sentence, fallback=fallback))

        return l

    def autocorrect_sentence(self, sentence):
        """ takes a list of tokens and returns a new list of tokens 
            where each non-word has been replaced by its most likely 
            spelling correction.
        """
        l = self.check_sentence(sentence, fallback=True)
        return [w[0] for w in l]

    def autocorrect_line(self, line):
        """ takes a string as input, tokenizes and segment it with spacy, 
            and then returns the concatenation of the result of calling 
            autocorrect_sentence on all of the resulting sentence objects
        """
        doc = self.nlp(line)  # use spacy to segment sentences
        l = []

        for sent in doc.sents:
            sentence = [str(w) for w in sent]
            l.extend(self.autocorrect_sentence(sentence))

        return l

    def suggest_sentence(self, sentence, max_suggestions):
        """ Takes as input a list of words, and returns a list of 
            words and lists of words. Correctly spelled words appear on their
            own; otherwise, a list of spelling suggestions is given in
            order of likelihood.
        """
        suggestions = self.check_sentence(sentence, fallback=True)
        l = []

        for i in range(len(sentence)):
            if sentence[i] in self.language_model or sentence[i] in self.punc:
                l.append(sentence[i])
            else:
                l.append(suggestions[i][0:max_suggestions])
        return l

    def suggest_line(self, line, max_suggestions):
        """ takes a string as input, tokenizes and segments it with spacy, 
            and then returns the concatenation of the result of calling 
            suggest_sentence on all of the resulting sentence objects
        """
        doc = self.nlp(line)  # use spacy to segment sentences
        l = []

        for sent in doc.sents:
            sentence = [str(w) for w in sent]
            l.extend(self.suggest_sentence(sentence, max_suggestions))

        return l
class SpellChecker():
    def __init__(self, max_distance, channel_model=None, language_model=None):
        self.nlp = spacy.load("en")
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        return (self.language_model.bigram_prob(prev_word, focus_word) +
                self.language_model.bigram_prob(focus_word, next_word)) / 2

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.prob(error_word, corrected_word)

    def isSubstring(self, w1, w2):
        if w1 == "":
            return True
        elif w2 == "":
            return False
        else:
            if w1[0] == w2[0]:
                return self.isSubstring(w1[1:], w2[1:])
            else:
                return self.isSubstring(w1, w2[1:])
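
    # e.g. isSubstring("ad", "abd"): 'a' matches 'a', then "d" is matched
    # against "bd" by skipping 'b', so it returns True. isSubstring("da",
    # "abd") consumes w2 up through 'd' and has no 'a' left to match, so it
    # returns False.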

    def inserts(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word) + 1:
                if self.isSubstring(word, w):
                    output.append(w)
        return output

    def deletes(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word) - 1:
                if self.isSubstring(w, word):
                    output.append(w)
        return output

    def substitutions(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word):
                numInc = 0
                for i in range(len(w)):
                    if w[i] != word[i]:
                        numInc += 1
                if numInc == 1:
                    output.append(w)
        return output

    def generate_candidates(self, word):
        output = []
        newWords = [word]
        for _ in range(self.max_distance):
            checkWords = []
            for w in newWords:
                if not all([x in string.ascii_lowercase for x in w]):
                    continue
                checkWords.extend(self.inserts(w))
                checkWords.extend(self.deletes(w))
                checkWords.extend(self.substitutions(w))
            output.extend(checkWords)
            newWords = checkWords

        return list(set(output))

    def sortList(self, wordList, prevWord, targetWord, nextWord):
        output = []
        for word in wordList:
            bs = self.bigram_score(prevWord, word, nextWord)
            us = self.unigram_score(word)
            cm = self.cm_score(targetWord, word)
            score = 0.5 * cm + 0.25 * bs + 0.25 * us
            output.append((word, score))
        # higher combined score is better, so sort descending
        output.sort(key=lambda w: w[1], reverse=True)
        return [w[0] for w in output]

    def check_non_words(self, sentence, fallback=False):
        output = []
        for i in range(len(sentence)):
            if sentence[i] in self.language_model:
                output.append([sentence[i]])
            else:
                L = self.generate_candidates(sentence[i])
                if fallback and len(L) == 0:
                    output.append([sentence[i]])
                else:
                    if i > 0:
                        prevWord = sentence[i - 1]
                    else:
                        prevWord = "<s>"
                    if i + 1 == len(sentence):
                        nextWord = "</s>"
                    else:
                        nextWord = sentence[i + 1]
                    L = self.sortList(L, prevWord, sentence[i], nextWord)
                    output.append(L)
        return output

    def check_sentence(self, sentence, fallback=False):
        return self.check_non_words(sentence, fallback)

    def check_line(self, line, fallback=False):
        sentences = self.nlp(line).sents
        output = []
        for sent in sentences:
            sentence = [str(w) for w in sent]
            output.extend(self.check_sentence(sentence, fallback=fallback))

        return output

    def check_text(self, text, fallback=False):
        sentences = self.nlp(text).sents
        output = []
        for sent in sentences:
            sentence = [str(w) for w in sent]
            output.append(self.check_sentence(sentence, fallback))
        return output

    def autocorrect_sentence(self, sentence):
        suggestions = self.check_sentence(sentence, True)
        return [word[0] for word in suggestions]

    def autocorrect_line(self, line):
        sentences = self.nlp(line).sents
        output = []

        for sent in sentences:
            sentence = [str(w) for w in sent]
            output.extend(self.autocorrect_sentence(sentence))

        return output

    def suggest_sentence(self, sentence, max_suggestions):
        suggestions = self.check_sentence(sentence, True)
        output = []

        for i in range(len(sentence)):
            if sentence[i] in self.language_model:
                output.append(sentence[i])
            else:
                output.append(suggestions[i][0:max_suggestions])

        return output

    def suggest_line(self, line, max_suggestions):
        sentences = self.nlp(line).sents
        output = []

        for sent in sentences:
            sentence = [str(w) for w in sent]
            output.extend(self.suggest_sentence(sentence, max_suggestions))

        return output

    def suggest_text(self, text, max_suggestions):
        sentences = self.nlp(text).sents
        output = []

        for sent in sentences:
            sentence = [str(w) for w in sent]
            suggestion = self.suggest_sentence(sentence, max_suggestions)
            for sug in suggestion:
                output.append(sug)

        return output
                        bigram_score = self.bigram_score(sentence[current_index - 1], words[i], "</s>") # end of sentence
                    elif current_index == 0: # start of sentence
                        bigram_score = self.bigram_score("<s>", words[i], sentence[current_index + 1])
                    else:
                        bigram_score = self.bigram_score(sentence[current_index - 1], words[i], sentence[current_index + 1])
                    scores.append(self.cm_score(error_word, words[i]) + 0.5 * bigram_score + 0.5 * self.unigram_score(words[i]))
                    i += 1
                return_list.append(scores)
            current_index += 1
        return list_of_words, return_list

if __name__ == "__main__":
    lm = AddAlphaBigramModel(alpha=.1)
    lm.train()

    cm = EditDistanceFinder()
    cm.train("wikipedia_misspellings.txt")

    s = SpellChecker(cm, lm, 1)
    print(s.check_sentence(["how", "are", "yoo", "sir"]))
    print()
    print(s.check_sentence(["they", "did", "not", "yb", "any", "menas"]))
    print()
    print(s.autocorrect_sentence(["they", "did", "not", "yb", "any", "menas"]))
    print()
    print(s.autocorrect_sentence(["menas"]))
    print()
    print(s.__score_probabilities__(["they", "did", "not", "yb", "any", "menas"]))
    print()
    print(s.suggest_sentence(["they", "did", "not", "yb", "any", "menas"], 3))
    print()