class StupidBackoffLanguageModel:

  def __init__(self, corpus):
    """Initialize your data structures in the constructor."""
    # Component models: unigram model is the backoff target, bigram model
    # supplies the primary counts.
    # NOTE(review): if the Laplace* constructors already train on `corpus`,
    # the train() call below trains each model a second time -- confirm
    # against their definitions before removing it.
    self.ULM = LaplaceUnigramLanguageModel(corpus)
    self.BLM = LaplaceBigramLanguageModel(corpus)
    self.train(corpus)

  def train(self, corpus):
    """ Takes a corpus and trains your language model.
        Compute any counts or other corpus statistics in this function.
    """
    # Delegate training to the component models.
    self.ULM.train(corpus)
    self.BLM.train(corpus)

  def score(self, sentence):
    """ Takes a list of strings as argument and returns the log-probability of the
        sentence using your language model. Use whatever data you computed in train() here.

        For each adjacent word pair (first, second):
          * bigram observed  -> add log of the MLE estimate count(first, second) / count(first, *)
          * first seen, pair unseen -> back off to the unigram score of `second`
          * first unseen     -> back off to the unigram score of `first`
        Returns 0.0 for sentences with fewer than two tokens.
    """
    result = 0.0
    # Walk adjacent word pairs; zip stops at the shorter list, which
    # reproduces the original range(len(sentence)-1) bounds.
    for first, second in zip(sentence, sentence[1:]):
      followers = self.BLM.bigram.get(first)  # single lookup instead of `in` + get
      if followers is None:
        # the first word does not appear in the training bigrams
        result += self.ULM.score(first)
      elif second in followers:
        # bigram observed: do not use backoff
        numer = followers[second]
        denom = sum(followers.values())
        result += math.log(float(numer) / denom)
      else:
        # pair unseen: use backoff to the unigram model
        result += self.ULM.score(second)
    return result
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data.
    """

    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)
    #print str(trainingCorpus)

    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)

    # Single-argument print(...) prints identically under Python 2 and 3.
    print('Uniform Language Model: ')
    uniformLM = UniformLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(uniformLM.words)))
    print(sent)
    print(tokens)
    print("uniform score=" + str(uniformLM.score(tokens)))

    print('Unigram Language Model: ')
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(unigramLM.unigramCounts)))
    print("unigram score=" + str(unigramLM.score(tokens)))

    print('Laplace Unigram Language Model: ')
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print("VocSize= " + str(len(laplaceUnigramLM.f1)))
    print("unigram score=" + str(laplaceUnigramLM.score(tokens)))

    print('Laplace Bigram Language Model: ')
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print("bigram score=" + str(laplaceBigramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=2')
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print("N=2gram score=" + str(laplaceN2gramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=3')
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3)
    laplaceN3gramLM.save("smallN3gram.LM")
    # BUG FIX: originally scored with laplaceN2gramLM while labeling the
    # output "N=3gram"; score the N=3 model that was just trained.
    print("N=3gram score=" + str(laplaceN3gramLM.score(tokens)))

    print('Custom Language Model: ')
    customLM = CustomLanguageModel(trainingCorpus, N=2)
    print("Custom LM score=" + str(customLM.score(tokens)))
# Example #3
# 0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data.
    """

    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)
    #print str(trainingCorpus)

    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)

    # Single-argument print(...) prints identically under Python 2 and 3.
    print('Uniform Language Model: ')
    uniformLM = UniformLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(uniformLM.words)))
    print(sent)
    print(tokens)
    print("uniform score=" + str(uniformLM.score(tokens)))

    print('Unigram Language Model: ')
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(unigramLM.unigramCounts)))
    print("unigram score=" + str(unigramLM.score(tokens)))

    print('Laplace Unigram Language Model: ')
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print("VocSize= " + str(len(laplaceUnigramLM.f1)))
    print("unigram score=" + str(laplaceUnigramLM.score(tokens)))

    print('Laplace Bigram Language Model: ')
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print("bigram score=" + str(laplaceBigramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=2')
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print("N=2gram score=" + str(laplaceN2gramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=3')
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3)
    laplaceN3gramLM.save("smallN3gram.LM")
    # BUG FIX: originally scored with laplaceN2gramLM while labeling the
    # output "N=3gram"; score the N=3 model that was just trained.
    print("N=3gram score=" + str(laplaceN3gramLM.score(tokens)))

    print('Custom Language Model: ')
    customLM = CustomLanguageModel(trainingCorpus, N=2)
    print("Custom LM score=" + str(customLM.score(tokens)))