def generate_model(self, train):
    '''
    Build an interpolated Kneser-Ney smoothed ngram model from `train`,
    a list of tokenized sentences.

    Populates:
      self.freq          -- prefix -> total count of ngrams starting with it
      self.kn_ngram_prob -- prefix -> {term: log smoothed probability}
      self.full_discount -- log of the fallback mass for unseen ngrams
    '''
    self.freq = {}
    condFreq = {}
    continuation = {}
    # Accumulate conditional ngram frequencies over the training corpus.
    # list(sent) replaces the element-by-element [w for w in sent] copy.
    for sent in train:
        condFreq = AccumCondFreqs(
            condFreq, CondFreqs(generate_ngrams, list(sent), self.n))
    # Count how often each prefix starts an ngram; for each term, collect
    # the set of distinct prefixes it continues (continuation counts).
    for p, s in condFreq.items():
        self.freq[p] = float(sum(s.values()))
        for w, c in s.items():
            if w not in continuation:
                continuation[w] = set()
            continuation[w].add(p)
    # Now calculate the model parameters.
    unique_ngram_starts = float(len(condFreq))
    # sum(...) replaces reduce(add, ...): identical total, clearer intent.
    self.full_discount = log(self.discount / sum(self.freq.values()))
    for p, s in condFreq.items():
        self.kn_ngram_prob[p] = {}
        # Mass removed by discounting, redistributed over continuations.
        interpolation_weight = self.discount * float(len(s)) / self.freq[p]
        for w, c in s.items():
            initial_term_count = float(len(continuation[w]))
            self.kn_ngram_prob[p][w] = log(
                max(c - self.discount, 0.0) / self.freq[p]
                + (interpolation_weight * initial_term_count) / unique_ngram_starts)
    SmoothedModel.generate_model(self, train)
def evaluate(self, sentence):
    '''
    Evaluate a tokenized sentence's log2 probability under the stored
    ngram model.

    self.model is (probs, unseen_prob): probs maps prefix ->
    {term: probability}; unseen_prob is the mass reserved for events
    never seen in training.
    '''
    SmoothedModel.evaluate(self, sentence)
    probs, unseen_prob = self.model
    # NOTE(review): log-prob accumulators usually start at 0.0; kept at
    # 1.0 to match the sibling evaluate() implementation in this file.
    lp = 1.0
    cf = CondFreqs(generate_ngrams, sentence, self.n)
    for prefix, suffix in cf.items():
        if prefix not in probs:
            # Bug fix: use base-2 log, consistent with the seen-ngram terms.
            lp += log(unseen_prob, 2)
        else:
            for term, count in suffix.items():
                if term in probs[prefix]:
                    lp += count * log(probs[prefix][term], 2)
                else:
                    # Bug fix: weight by the observed count and use base 2,
                    # mirroring both the seen-term branch above and the
                    # sibling additive-smoothing evaluate().
                    lp += count * log(unseen_prob, 2)
    return lp
def evaluate(self, sentence):
    '''
    Evaluate a tokenized sentence's log2 probability under the stored
    additively-smoothed ngram model.

    self.model is (probs, V): probs maps prefix -> {term: probability};
    V is the vocabulary size used by the additive smoothing.
    '''
    SmoothedModel.evaluate(self, sentence)
    probs, V = self.model
    # NOTE(review): accumulator starts at 1.0, not 0.0 — preserved as-is
    # to keep existing scoring behavior.
    lp = 1.0
    cf = CondFreqs(generate_ngrams, sentence, self.n)
    # .items() replaces Py2-only .iteritems() (works on Python 2 and 3).
    for prefix, suffix in cf.items():
        if prefix not in probs:
            # Unseen prefix: charge the smoothed floor probability once.
            lp += log(1.0 / (1.0 + V), 2)
        else:
            for term, count in suffix.items():
                # Number of distinct terms observed after this prefix.
                N = len(probs[prefix])
                if term in probs[prefix]:
                    lp += count * log(probs[prefix][term], 2)
                else:
                    # Unseen term after a known prefix.
                    lp += count * log((1.0 / (V + N)), 2)
    return lp
def generate_model(self, train):
    '''
    Given a list of lists of tokenized sentences, generate and store a
    model corresponding to this type of smoothing: accumulate conditional
    ngram frequencies, fit a linear regression to the log
    frequency-of-frequencies, and derive smoothed counts/probabilities.
    '''
    cacc = {}
    for line in train:
        # list(line) replaces the element-by-element [t for t in line] copy.
        cacc = AccumCondFreqs(
            cacc,
            CondFreqs(generate_ngrams, list(line), self.n))
    # (frequency, how many ngrams occur with that frequency) pairs.
    fof = [(i, float(len(n))) for (i, n) in freqOfFreq(cacc)]
    self.model = self.__model_probs(
        smoothed_counts(cacc, linear_regression(fof, log)))
    SmoothedModel.generate_model(self, train)
def bigram_freqs(train):
    '''
    Return the accumulated bigram conditional frequencies for a corpus.

    train -- an iterable of tokenized sentences.

    >>> from nltk.data import load
    >>> sent_seperator = load('tokenizers/punkt/english.pickle')
    >>> from nltk.corpus import gutenberg as g
    >>> sents = sent_seperator.tokenize(g.raw('carroll-alice.txt'))
    >>> from ngram_helpers import preprocess, tokenize
    >>> data_gen = [tokenize(preprocess(sent)) for sent in sents]
    >>> bigram_freqs(data_gen).items()[:5]
    [('secondly', {'because': 2.0}), ('pardon', {'said': 1.0, '</s>': 4.0, 'your': 1.0}), ('saves', {'a': 1.0}), ('knelt', {'down': 1.0}), ('four', {'feet': 1.0, 'hours': 2.0, 'thousand': 1.0, 'inches': 1.0, 'times': 3.0})]
    >>>
    '''
    accumulated = {}
    for sentence in train:
        # Per-sentence bigram frequencies, folded into the running total.
        sentence_bigrams = CondFreqs(generate_ngrams, sentence, 2)
        accumulated = AccumCondFreqs(accumulated, sentence_bigrams)
    return accumulated
def generate_model(self, train):
    '''
    Given a list of lists of tokenized sentences, generate and store a
    model corresponding to this type of smoothing.

    >>> from nltk.data import load
    >>> stok = load('tokenizers/punkt/english.pickle')
    >>> from nltk.corpus import gutenberg as g
    >>> from ngram_helpers import *
    >>> train = [tokenize(preprocess(sent)) for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    >>> from additive_smoothing import AdditiveSmoothing
    >>> a_s = AdditiveSmoothing()
    >>> a_s.generate_model(train)
    >>> a_s.model[0].items()[:2]
    [('blessed her', {'before': 0.00027991602519244227}), ('long understood', {'me': 0.00027987685418415898, 'you': 0.00027987685418415898})]
    >>>
    '''
    cacc = {}
    for line in train:
        # list(line) replaces the element-by-element [t for t in line] copy.
        cacc = AccumCondFreqs(
            cacc,
            CondFreqs(generate_ngrams, list(line), self.n))
    self.model = self.__smoothed_probs(cacc)
    SmoothedModel.generate_model(self, train)