Example 1
 def generate_model(self, train):
   self.freq = {}
   condFreq = {}
   continuation = {}
   # get ngrams
   for sent in train:
     condFreq = AccumCondFreqs(condFreq, CondFreqs(generate_ngrams, [w for w in sent], self.n))
   # count the number of times each prefix starts an ngram
    for p, s in condFreq.items():
     self.freq[p] = float(sum(s.values()))
     # continuation probability counts
      for w, c in s.items():
       if w not in continuation:
         continuation[w] = set()
       continuation[w].add(p)
   # now calculate the model parameters
   unique_ngram_starts = float(len(condFreq))
    self.full_discount = log(self.discount/sum(self.freq.values()))
    for p, s in condFreq.items():
     self.kn_ngram_prob[p] = {}
     interpolation_weight = self.discount*(float(len(s)))/self.freq[p]
      for w, c in s.items():
       initial_term_count = float(len(continuation[w]))
       self.kn_ngram_prob[p][w] = log(max(c - self.discount, 0.0)/self.freq[p] + \
         (interpolation_weight * initial_term_count)/unique_ngram_starts) 
   SmoothedModel.generate_model(self, train)
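Example 1 (and every snippet below) leans on three helpers from the project's n-gram module that never appear on this page: generate_ngrams, CondFreqs and AccumCondFreqs. A minimal sketch of what they plausibly look like, assuming prefixes are space-joined (n-1)-grams; the real project may define them differently:

def generate_ngrams(tokens, n):
  ''' yield each n-gram as a (space-joined prefix, last word) pair '''
  for i in range(len(tokens) - n + 1):
    gram = tokens[i:i + n]
    yield ' '.join(gram[:-1]), gram[-1]

def CondFreqs(ngram_fn, tokens, n):
  ''' map each (n-1)-gram prefix to a dict of {next word: count} '''
  freqs = {}
  for prefix, word in ngram_fn(tokens, n):
    bucket = freqs.setdefault(prefix, {})
    bucket[word] = bucket.get(word, 0) + 1
  return freqs

def AccumCondFreqs(acc, freqs):
  ''' merge one sentence's conditional counts into the accumulator '''
  for prefix, suffixes in freqs.items():
    bucket = acc.setdefault(prefix, {})
    for word, count in suffixes.items():
      bucket[word] = bucket.get(word, 0) + count
  return acc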
Example 2
 def evaluate(self, sentence):
     ''' evaluate a tokenized sentence probability using the passed-in
     ngram model
     '''
     SmoothedModel.evaluate(self, sentence)
     probs, unseen_prob = self.model
     lp = 0.0  # accumulate the base-2 log probability
     cf = CondFreqs(generate_ngrams, sentence, self.n)
     for prefix, suffix in cf.items():
         if prefix not in probs:
             lp += log(unseen_prob, 2)
         else:
             for term, count in suffix.items():
                 if term in probs[prefix]:
                     lp += count * log(probs[prefix][term], 2)
                 else:
                     lp += count * log(unseen_prob, 2)
     return lp
Example 3
 def evaluate(self, sentence):
   ''' evaluate a tokenized sentence probability using the passed-in
   ngram model
   '''
   SmoothedModel.evaluate(self, sentence)
   probs, V = self.model
   lp = 0.0  # accumulate the base-2 log probability
   cf = CondFreqs(generate_ngrams, sentence, self.n)
   for prefix, suffix in cf.items():
     if prefix not in probs:
       lp += log(1.0/(1.0+V), 2)
     else:
       N = len(probs[prefix])  # observed continuations for this prefix
       for term, count in suffix.items():
         if term in probs[prefix]:
           lp += count * log(probs[prefix][term], 2)
         else:
           lp += count * log((1.0/(V+N)), 2)
   return lp
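Both evaluate() variants return a summed base-2 log probability, which is why every log call above passes base 2. The usual way to report such a score is per-token perplexity; a hypothetical helper (not part of the original module) under that assumption:

def perplexity(log2_prob, token_count):
  ''' convert a summed base-2 log probability into per-token perplexity '''
  return 2.0 ** (-log2_prob / token_count)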
Example 4
 def generate_model(self, train):
     ''' given a list of lists of tokenized sentences, generate and store
     a model corresponding to this type of smoothing.
     '''
     cacc = {}
     for line in train:
         cacc = AccumCondFreqs(
             cacc, CondFreqs(generate_ngrams, [t for t in line], self.n))
     fof = [(i, float(len(n))) for (i, n) in freqOfFreq(cacc)]
     self.model = self.__model_probs(
         smoothed_counts(cacc, linear_regression(fof, log)))
     SmoothedModel.generate_model(self, train)
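Example 4 is a Good-Turing style scheme: freqOfFreq builds the frequency-of-frequencies table (how many distinct n-grams occur exactly i times), linear_regression fits it in log space, and smoothed_counts re-estimates the raw counts from that fit. None of those helpers are shown here; a plausible sketch of freqOfFreq, assuming it groups n-grams by their raw count:

def freqOfFreq(cond_freqs):
  ''' group n-grams by raw count: (count, [n-grams seen that many times]).
  A hypothetical reconstruction; the project's own helper may differ. '''
  buckets = {}
  for prefix, suffixes in cond_freqs.items():
    for word, count in suffixes.items():
      buckets.setdefault(count, []).append((prefix, word))
  return sorted(buckets.items())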
Example 5
def bigram_freqs(train):
  ''' given a list of tokenized sentences, return 
  bigram frequencies for this model
  >>> from nltk.data import load
  >>> sent_separator = load('tokenizers/punkt/english.pickle')
  >>> from nltk.corpus import gutenberg as g
  >>> sents = sent_separator.tokenize(g.raw('carroll-alice.txt'))
  >>> from ngram_helpers import preprocess, tokenize
  >>> data_gen = [tokenize(preprocess(sent)) for sent in sents]
  >>> list(bigram_freqs(data_gen).items())[:5]
  [('secondly', {'because': 2.0}), ('pardon', {'said': 1.0, '</s>': 4.0, 'your': 1.0}), ('saves', {'a': 1.0}), ('knelt', {'down': 1.0}), ('four', {'feet': 1.0, 'hours': 2.0, 'thousand': 1.0, 'inches': 1.0, 'times': 3.0})]
  >>>
  '''
  cond_freqs = {}
  for sent in train:
    cond_freqs = AccumCondFreqs(cond_freqs, CondFreqs(generate_ngrams, sent, 2))
  return cond_freqs
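bigram_freqs returns raw conditional counts. Normalizing each row by its total yields unsmoothed maximum-likelihood probabilities; a one-function sketch (not part of the original module):

def mle_probs(cond_freqs):
  ''' turn raw conditional counts into maximum-likelihood probabilities '''
  return {prefix: {w: c / sum(suffixes.values()) for w, c in suffixes.items()}
          for prefix, suffixes in cond_freqs.items()}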
Example 6
 def generate_model(self, train):
     ''' given a list of lists of tokenized sentences, generate and store
     a model corresponding to this type of smoothing.
     >>> from nltk.data import load
     >>> stok = load('tokenizers/punkt/english.pickle')
     >>> from nltk.corpus import gutenberg as g
     >>> from ngram_helpers import *
     >>> train = [tokenize(preprocess(sent)) for sent in stok.tokenize(g.raw('austen-emma.txt'))]
     >>> from additive_smoothing import AdditiveSmoothing
     >>> a_s = AdditiveSmoothing()
     >>> a_s.generate_model(train)
     >>> list(a_s.model[0].items())[:2]
     [('blessed her', {'before': 0.00027991602519244227}), ('long understood', {'me': 0.00027987685418415898, 'you': 0.00027987685418415898})]
     >>>
     '''
     cacc = {}
     for line in train:
         cacc = AccumCondFreqs(
             cacc, CondFreqs(generate_ngrams, [t for t in line], self.n))
     self.model = self.__smoothed_probs(cacc)
     SmoothedModel.generate_model(self, train)
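The private __smoothed_probs helper is not shown. Judging by the doctest output and by the (probs, V) pair that Example 3's evaluate() unpacks, it plausibly computes classic add-k estimates, P(w|h) = (c(h,w) + k) / (c(h) + k*V), with V the vocabulary size; a sketch under that assumption:

def additive_probs(cond_freqs, k=1.0):
  ''' add-k (Laplace when k=1) estimates; a guess at what
  __smoothed_probs computes, not the project's actual code '''
  vocab = {w for suffixes in cond_freqs.values() for w in suffixes}
  V = float(len(vocab))
  probs = {}
  for prefix, suffixes in cond_freqs.items():
    total = sum(suffixes.values())
    probs[prefix] = {w: (c + k) / (total + k * V) for w, c in suffixes.items()}
  return probs, V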