Example #1
 def generate_model(self, train):
   # assumes Python 2 and module-level imports: from math import log; from operator import add
   every_word = set()
   self.freq = {}
   condFreq = {}
   continuation = {}
   # get ngrams
   for sent in train:
     condFreq = AccumCondFreqs(condFreq, CondFreqs(generate_ngrams, [w for w in sent], self.n))
   # count the number of times each prefix starts an ngram
   for p, s in condFreq.iteritems():
     self.freq[p] = float(sum(s.values()))
     # continuation probability counts
     for w,c in s.iteritems():
       if w not in continuation:
         continuation[w] = set()
       continuation[w].add(p)
   # now calculate the model parameters
   unique_ngram_starts = float(len(condFreq))
   self.full_discount = log(self.discount/reduce(add, self.freq.values()))
   for p, s in condFreq.iteritems():
     self.kn_ngram_prob[p] = {}
     interpolation_weight = self.discount*(float(len(s)))/self.freq[p]
     for w,c in s.iteritems():
       initial_term_count = float(len(continuation[w]))
       self.kn_ngram_prob[p][w] = log(max(c - self.discount, 0.0)/self.freq[p] + \
         (interpolation_weight * initial_term_count)/unique_ngram_starts) 
   SmoothedModel.generate_model(self, train)
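In the code's notation, with d = self.discount, c(p, w) the count of word w after prefix p, and c(p) = self.freq[p], the value stored in kn_ngram_prob is the log of the interpolated Kneser-Ney estimate:

\[
P_{KN}(w \mid p) = \frac{\max(c(p,w) - d,\ 0)}{c(p)} + \frac{d\,\lvert\{w' : c(p,w') > 0\}\rvert}{c(p)} \cdot \frac{\lvert\{p' : c(p',w) > 0\}\rvert}{\lvert\{\text{distinct prefixes}\}\rvert}
\]

The middle factor is interpolation_weight, and the final ratio is the continuation probability, initial_term_count / unique_ngram_starts.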
Example #2
 def generate_model(self, train):
     # assumes Python 2 and module-level imports: from math import log; from operator import add
     every_word = set()
     self.freq = {}
     condFreq = {}
     continuation = {}
     # get ngrams
     for sent in train:
         condFreq = AccumCondFreqs(condFreq, CondFreqs(generate_ngrams, [w for w in sent], self.n))
     # count the number of times each prefix starts an ngram
     for p, s in condFreq.iteritems():
         self.freq[p] = float(sum(s.values()))
         # continuation probability counts
         for w, c in s.iteritems():
             if w not in continuation:
                 continuation[w] = set()
             continuation[w].add(p)
     # now calculate the model parameters
     unique_ngram_starts = float(len(condFreq))
     self.full_discount = log(self.discount / reduce(add, self.freq.values()))
     for p, s in condFreq.iteritems():
         self.kn_ngram_prob[p] = {}
         interpolation_weight = self.discount * (float(len(s))) / self.freq[p]
         for w, c in s.iteritems():
             initial_term_count = float(len(continuation[w]))
             self.kn_ngram_prob[p][w] = log(
                 max(c - self.discount, 0.0) / self.freq[p]
                 + (interpolation_weight * initial_term_count) / unique_ngram_starts
             )
     SmoothedModel.generate_model(self, train)
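For context, a minimal usage sketch mirroring the AdditiveSmoothing doctest later in this listing; the class name KneserNeySmoothing is an assumption, since the listing shows only the methods:

    # hypothetical driver; KneserNeySmoothing is an assumed class name
    from nltk.data import load
    from nltk.corpus import gutenberg as g
    from ngram_helpers import *

    stok = load('tokenizers/punkt/english.pickle')
    train = [tokenize(preprocess(sent)) for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    kn = KneserNeySmoothing(n=2)  # discount d = 0.75 by default
    kn.generate_model(train)
    # kn.kn_ngram_prob maps prefix -> {word: log P_KN(word | prefix)}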
Example #3
 def __init__(self, n=3):
   '''initialize with the ngram arity for the model we are building
   '''
   self.n = n  #bigrams?  trigrams? other?
   self.model = () # a tuple to store the model
   self.k = 5
   SmoothedModel.__init__(self)
Example #4
 def __init__(self, n=2):
     """Initialize your data structures in the constructor."""
     self.kn_ngram_prob = {}
     self.discount = 0.75
     self.full_discount = 0.0
     self.continuation_prob = {}
     self.n = n  # bigrams?  trigrams? other?
     SmoothedModel.__init__(self)
Example #5
 def __init__(self, n=2):
   """Initialize your data structures in the constructor."""
   self.kn_ngram_prob = {}
   self.discount = 0.75
   self.full_discount = 0.0
   self.continuation_prob = {}
   self.n = n  #bigrams?  trigrams? other?
   SmoothedModel.__init__(self)
Example #6
 def generate_model(self, train):
     """ given a list of lists of tokenized sentences, generate and store
     a model corresponding to this type of smoothing.
     """
     cacc = {}
     for line in train:
         cacc = AccumCondFreqs(cacc, CondFreqs(generate_ngrams, [t for t in line], self.n))
     # frequency-of-frequencies table: (frequency, number of distinct ngrams with that frequency)
     fof = [(i, float(len(n))) for (i, n) in freqOfFreq(cacc)]
     self.model = model_probs(smoothed_counts(cacc, linear_regression(fof, log)))
     SmoothedModel.generate_model(self, train)
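The freqOfFreq / linear_regression(fof, log) pipeline above matches Simple Good-Turing smoothing (Gale and Sampson), which fits a line to the frequency-of-frequencies in log-log space, \( \log S(c) = a + b \log c \), and replaces each raw count c with

\[
c^{*} = (c + 1)\,\frac{S(c+1)}{S(c)}
\]

This reading is inferred from the helper names; the implementations of freqOfFreq and smoothed_counts are not shown in the listing.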
Example #7
 def evaluate(self, sentence):
   ''' evaluate a tokenized sentence probability using the passed-in
   ngram model
   '''
   SmoothedModel.evaluate(self, sentence)
   probs, V = self.model
   lp = 0.0  # accumulate the log probability (log 1 = 0)
   cf = CondFreqs(generate_ngrams, sentence, self.n)
   for prefix, suffix in cf.iteritems():
     if prefix not in probs:
       # unseen prefix: fall back to a uniform additive estimate
       lp += log(1.0/(1.0 + V), 2)
     else:
       for term, count in suffix.iteritems():
         N = len(probs[prefix])
         if term in probs[prefix]:
           lp += count * log(probs[prefix][term], 2)
         else:
           # term never seen after this prefix
           lp += count * log(1.0/(V + N), 2)
   return lp
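Because evaluate() returns a base-2 log probability, it converts to perplexity by raising 2 to the negative per-token average. A minimal sketch (perplexity is a hypothetical helper, not part of the listing; normalizing by token count is an assumption):

    def perplexity(model, sentences):
        # model is any SmoothedModel subclass from this listing
        lp = 0.0
        tokens = 0
        for sent in sentences:
            lp += model.evaluate(sent)  # base-2 log probability of the sentence
            tokens += len(sent)
        return 2.0 ** (-lp / tokens)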
Example #8
 def generate_model(self, train):
   ''' given a list of lists of tokenized sentences, generate and store 
   a model corresponding to this type of smoothing.
   >>> from nltk.data import load
   >>> stok = load('tokenizers/punkt/english.pickle')
   >>> from nltk.corpus import gutenberg as g
   >>> from ngram_helpers import *
   >>> train = [tokenize(preprocess(sent)) for sent in stok.tokenize(g.raw('austen-emma.txt'))]
   >>> from additive_smoothing import AdditiveSmoothing
   >>> a_s = AdditiveSmoothing()
   >>> a_s.generate_model(train)
   >>> a_s.model[0].items()[:2]
   [('blessed her', {'before': 0.00027991602519244227}), ('long understood', {'me': 0.00027987685418415898, 'you': 0.00027987685418415898})]
   >>>
   '''
   cacc = {}
   for line in train:
     cacc = AccumCondFreqs(cacc, CondFreqs(generate_ngrams, [t for t in line], self.n))
   self.model = self.__smoothed_probs(cacc)
   SmoothedModel.generate_model(self, train)
Example #9
  def generate_model(self, train):
      ''' given a list of lists of tokenized sentences, generate and store
      a model corresponding to this type of smoothing.
      >>> from nltk.data import load
      >>> stok = load('tokenizers/punkt/english.pickle')
      >>> from nltk.corpus import gutenberg as g
      >>> from ngram_helpers import *
      >>> train = [tokenize(preprocess(sent)) for sent in stok.tokenize(g.raw('austen-emma.txt'))]
      >>> from additive_smoothing import AdditiveSmoothing
      >>> a_s = AdditiveSmoothing()
      >>> a_s.generate_model(train)
      >>> a_s.model[0].items()[:2]
      [('blessed her', {'before': 0.00027991602519244227}), ('long understood', {'me': 0.00027987685418415898, 'you': 0.00027987685418415898})]
      >>>
      '''
     cacc = {}
     for line in train:
         cacc = AccumCondFreqs(
             cacc, CondFreqs(generate_ngrams, [t for t in line], self.n))
     self.model = self.__smoothed_probs(cacc)
     SmoothedModel.generate_model(self, train)
Example #10
  def generate_model(self, train):
    ''' given a list of lists of tokenized sentences, generate and store
    a model corresponding to this type of smoothing.
    '''
    # accumulate raw (conditional) frequencies for every ngram order up to self.n
    ncf = {}
    smoothedf = {}
    for i in xrange(1, self.n+1):
      ncf[i] = {}
    for line in train:
      ncf[1] = AccumFreqs(ncf[1], Freqs(generate_ngrams, [t for t in line], 1))
      for i in xrange(2, self.n+1):
        # get ngram frequencies up to n
        ncf[i] = AccumCondFreqs(ncf[i], CondFreqs(generate_ngrams, [t for t in line], i))
    # get the smoothed counts for each ngram order; order 1 keeps raw counts plus a total
    smoothedf[1] = (ncf[1], sum(ncf[1].values()))
    for i in xrange(2, self.n+1):
      fof = [(c, float(len(grams))) for (c, grams) in freqOfFreq(ncf[i])]
      smoothedf[i] = smoothed_counts(ncf[i], linear_regression(fof, log))

    # now get the model probabilities for each ngram layer
    modelp = {}
    for i, m in smoothedf.iteritems():
      probs = {}
      if i == 1:
        counts, total = m
        for w, c in counts.iteritems():
          probs[w] = float(c)/total
      else:
        for p, s in m.iteritems():
          probs[p] = {}
          total = float(sum(s.values()))
          for w, c in s.iteritems():
            probs[p][w] = c/total
      modelp[i] = probs
    self.model = (modelp, smoothedf)
    SmoothedModel.generate_model(self, train)
    return self.model
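The method above builds one probability table per ngram order, but the listing does not show the lookup side. A structural sketch of consulting such layered tables, backing off to shorter contexts (backoff_prob is a hypothetical helper; a full Katz backoff would also scale lower-order probabilities by a normalizing alpha, omitted here):

    def backoff_prob(modelp, ngram, i):
        # ngram is a list of i tokens; prefixes are space-joined strings,
        # matching keys like 'blessed her' in the doctests above
        if i == 1:
            return modelp[1].get(ngram[-1], 0.0)
        table = modelp[i].get(' '.join(ngram[:-1]), {})
        if ngram[-1] in table:
            return table[ngram[-1]]
        return backoff_prob(modelp, ngram[1:], i - 1)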
Example #11
 def __init__(self, n=3):
   self.n = n
   self.model = ()
   SmoothedModel.__init__(self)