Ejemplo n.º 1
0
 def __init__(self, max_edit_distance=2, dictionary=Dictionary(), 
              word_counts="data/count_brown_reuters.txt", 
              bigram_counts="data/count_2w_brown_reuters.txt"):
     
     """ Create a new Spellchecker
     
     Keyword arguments:
     max_edit_distance -- Maximum edit distance we should consider for words.
                          Anything above 2 is too computationally intensive for most systems.
     
     dictionary -- Dictionary to use
     word_counts -- Filename of word counts file
     bigram_counts -- Filename of bigram counts file
     """
     self.max_edit_distance = max_edit_distance
     self.dictionary = dictionary
     
     self.freq = FreqDist()
     self.freq.load(open(word_counts))
     
     self.bfreq = FreqDist()
     self.bfreq.load(open(bigram_counts))
     
     self.cmatrix = ConfusionMatrix()
Ejemplo n.º 2
0
#!/usr/bin/env python
"""
Count word bigrams in corpus and output to stdout

Uses both the Brown and Reuters corpus

@author: Johannes H. Jensen <*****@*****.**>
"""

if __name__ == '__main__':
    from nltk import bigrams
    from nltk.corpus import brown, reuters
    from corpus import FreqDist
    
    bs = bigrams(word.lower() for word in brown.words() + reuters.words() if word.isalpha())
    fdist = FreqDist(bs)
    print fdist.dumps()
Ejemplo n.º 3
0
class Spellchecker(object):
    """ Spellchecker which can process an arbitrary text """
    
    
    def __init__(self, max_edit_distance=2, dictionary=Dictionary(), 
                 word_counts="data/count_brown_reuters.txt", 
                 bigram_counts="data/count_2w_brown_reuters.txt"):
        
        """ Create a new Spellchecker
        
        Keyword arguments:
        max_edit_distance -- Maximum edit distance we should consider for words.
                             Anything above 2 is too computationally intensive for most systems.
        
        dictionary -- Dictionary to use
        word_counts -- Filename of word counts file
        bigram_counts -- Filename of bigram counts file
        """
        self.max_edit_distance = max_edit_distance
        self.dictionary = dictionary
        
        self.freq = FreqDist()
        self.freq.load(open(word_counts))
        
        self.bfreq = FreqDist()
        self.bfreq.load(open(bigram_counts))
        
        self.cmatrix = ConfusionMatrix()
    
    def prob(self, word, edit, context=None):
        """ Calculate probability of candidate word given edit info and context.
        
        Context can be one of (word_before, None), (None, word_after), 
        (word_before, word_after) or None.
        """
        p_word = self.freq.freq(word)
        p_edit = self.cmatrix.get_prob(edit) + 0.5
        p_context = 0.5
        
        if context:
            word_before, word_after = context
            
            if word_before or word_after:
                if word_before:
                    p_context += self.bfreq[(word_before, word)]
                if word_after:
                    p_context += self.bfreq[(word, word_after)]
        
        return p_word * p_edit * p_context
    
    def check(self, text):
        """ Check a text for spelling errors
        
        Returns a SpellcheckResult with information about misspelled words
        and candidates.
        """
        sents = [[w for w in word_tokenize(sent) if w.isalpha()] for sent in sent_tokenize(text)]
        
        results = SpellcheckResult(sents)
        
        # Loop through sentences, words
        for sentence_pos, sentence in enumerate(sents):
            for word_pos, word in enumerate(sentence):
                # Skip non-alphanumeric characters -- TODO: how to deal with this?
                if not word.isalpha():
                    continue
                
                # Keep case mask and lowercase word
                mask = CaseMask(word)
                word = word.lower()
                
                # Check if word is in our dictionary
                if self.dictionary.has_word(word):
                    continue
                
                # Determine context
                word_before = word_after = None
                if word_pos > 0:
                    word_before = sentence[word_pos - 1]
                if word_pos < len(sentence) - 1:
                    word_after = sentence[word_pos + 1]
                context = (word_before, word_after)
                
                for n in range(1, self.max_edit_distance + 1):
                    # Make a list of candidates of distance N
                    # edits_meta -> (word, (meta))
                    word_edits = edits_meta(n, word)
                    
                    # Find words that are in dictionary
                    dict_cands = set([w for w in word_edits if self.dictionary.has_word(w[0])])
                    
                    # Find unknown words from corpus
                    non_words = word_edits - dict_cands
                    #non_words = set([w for w in word_edits if not w[0] in cands])
                    corpus_cands = set([w for w in non_words if self.freq.has_key(w[0])])
                    
                    cands = dict_cands.union(corpus_cands)
                    
                    # Calculate probabilities for each candidate
                    # dict_cands: word -> probability
                    probs = {}
                    for cand in cands:
                        w, edit = cand
                        p = self.prob(w, edit, context)
                        if probs.has_key(w):
                            probs[w] += p
                        else:
                            probs[w] = p
                    
                    # Normalize probabilities
                    if probs:
                        probs = normalize(probs)
                    
                    # Sort candidates
                    candidates = sorted(probs.keys(), 
                                        cmp = lambda c1, c2: cmp(probs[c1], probs[c2]),
                                        reverse = True)
                    
                    if candidates:
                        # Found candidates, don't try any larger edit distance
                        break
                
                
                # Apply case mask to candidates
                candidates = map(mask.apply, candidates)
                
                # Add result
                result = Correction(mask.apply(word), candidates, probs, sentence_pos, word_pos)
                results.append(result)
        
        return results
    
    def correct(self, text):
        """ Correct misspelled word in text.
        Returns the corrected text. """
        raise NotImplementedError