Ejemplo n.º 1
0
def main():
    """Train each language model on the micro corpus and print its score for one sentence.

    The corpus is loaded from ``../data/micro/en_US/``. The Laplace unigram,
    bigram and N-gram models are additionally written to ``small*.LM`` files
    through their ``save`` method.
    """
    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)

    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)

    # Every print below takes a single parenthesised argument, which yields
    # the same output as a Python 2 print statement and as Python 3's print().
    print('Uniform Language Model: ')
    uniformLM = UniformLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(uniformLM.words)))
    print(sent)
    print(tokens)
    print("uniform score=" + str(uniformLM.score(tokens)))

    print('Unigram Language Model: ')
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(unigramLM.unigramCounts)))
    print("unigram score=" + str(unigramLM.score(tokens)))

    print('Laplace Unigram Language Model: ')
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print("VocSize= " + str(len(laplaceUnigramLM.f1)))
    print("unigram score=" + str(laplaceUnigramLM.score(tokens)))

    print('Laplace Bigram Language Model: ')
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print("bigram score=" + str(laplaceBigramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=2')
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print("N=2gram score=" + str(laplaceN2gramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=3')
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3)
    laplaceN3gramLM.save("smallN3gram.LM")
    # Score with the N=3 model itself; this line previously reused the N=2
    # model, so the reported "N=3gram score" was just the N=2 score again.
    print("N=3gram score=" + str(laplaceN3gramLM.score(tokens)))

    print('Custom Language Model: ')
    customLM = CustomLanguageModel(trainingCorpus, N=2)
    print("Custom LM score=" + str(customLM.score(tokens)))
Ejemplo n.º 2
0
def main():
    """Train each language model on the micro corpus and print its score for one sentence.

    The corpus is read from ``../data/micro/en_US/``. After training, the
    Laplace unigram, bigram and N-gram models are saved to ``small*.LM``
    files by calling their ``save`` method.
    """
    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)

    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)

    # Single-argument print(...) calls behave identically under the Python 2
    # print statement and the Python 3 print function.
    print('Uniform Language Model: ')
    uniformLM = UniformLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(uniformLM.words)))
    print(sent)
    print(tokens)
    print("uniform score=" + str(uniformLM.score(tokens)))

    print('Unigram Language Model: ')
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(unigramLM.unigramCounts)))
    print("unigram score=" + str(unigramLM.score(tokens)))

    print('Laplace Unigram Language Model: ')
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print("VocSize= " + str(len(laplaceUnigramLM.f1)))
    print("unigram score=" + str(laplaceUnigramLM.score(tokens)))

    print('Laplace Bigram Language Model: ')
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print("bigram score=" + str(laplaceBigramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=2')
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print("N=2gram score=" + str(laplaceN2gramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=3')
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3)
    laplaceN3gramLM.save("smallN3gram.LM")
    # Report the N=3 model's own score; the N=2 model was scored here by
    # mistake, which made the two lines always print the same number.
    print("N=3gram score=" + str(laplaceN3gramLM.score(tokens)))

    print('Custom Language Model: ')
    customLM = CustomLanguageModel(trainingCorpus, N=2)
    print("Custom LM score=" + str(customLM.score(tokens)))
Ejemplo n.º 3
0
    def output(self, partId, ch_aux):
        """Run the student code for one graded part and return its output.

        Parts 1-2 return a JSON string holding, for every line of ``ch_aux``,
        the ``(editedWord, rule())`` pairs produced by ``EditModel.edits``.
        Parts 3-10 load ``ch_aux`` into a test corpus, spell-correct it with
        the language model selected by ``partId`` and return the corrected
        output with the part id spliced in at the front.
        For any other ``partId`` a message is printed and ``None`` is returned.
        """
        trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

        if partId in [1, 2]:
            editModel = EditModel('../data/count_1edit.txt', trainCorpus)
            return json.dumps([[(e.editedWord, e.rule())
                                for e in editModel.edits(line.strip())]
                               for line in ch_aux.split("\n")])

        testCorpus = HolbrookCorpus()
        testCorpus.slurpString(ch_aux)
        if partId in [3, 4]:
            lm = LaplaceUnigramLanguageModel(trainCorpus)
        elif partId in [5, 6]:
            lm = LaplaceBigramLanguageModel(trainCorpus)
        elif partId in [7, 8]:
            lm = StupidBackoffLanguageModel(trainCorpus)
        elif partId in [9, 10]:
            lm = CustomLanguageModel(trainCorpus)
        else:
            # The message used to be one literal containing '" + partId', so
            # the offending id was never shown. str() keeps the concatenation
            # valid for non-string ids.
            print('Unknown partId: ' + str(partId))
            return None

        speller = SpellCorrect(lm, trainCorpus)
        corrected = speller.correctCorpus(testCorpus)
        # Put in the part ID as well: replace the first character of the
        # corrected output with '[["<partId>"],'.
        return '[["%d"],%s' % (partId, corrected[1:])
Ejemplo n.º 4
0
 def __init__(self, corpus):
     """Create the bigram count table and the unigram sub-model, then train.

     ``self.vocab`` is set after training to the number of keys held in
     ``self.bigramCount``.
     """
     # Missing keys read as a count of 0.
     self.bigramCount = collections.defaultdict(lambda: 0)
     self.uniGram = LaplaceUnigramLanguageModel(corpus)
     self.train(corpus)
     # len() of a mapping is its key count, so no keys() list is needed.
     self.vocab = len(self.bigramCount)
 def __init__(self, corpus):
   """Build the bigram and unigram sub-models, set the discount, then train.

   ``self.ends_with`` starts empty and is expected to be filled by
   ``train`` (not visible here).
   """
   self.BLM = LaplaceBigramLanguageModel(corpus)
   self.ULM = LaplaceUnigramLanguageModel(corpus)
   self.discount = 0.75
   # Maps a word to the number of bigrams that end with that word.
   self.ends_with = {}
   self.train(corpus)
Ejemplo n.º 6
0
 def __init__(self, corpus):
     """Build Laplace unigram and bigram models and keep their count tables.

     Only the unigram model's ``total`` and the two models' count
     attributes are stored; the model objects themselves are discarded.
     """
     uni = LaplaceUnigramLanguageModel(corpus)
     bi = LaplaceBigramLanguageModel(corpus)
     self.total = uni.total
     self.UnigramCounts = uni.LaplaceUnigramCounts
     self.BigramCounts = bi.LaplaceBigramCounts
Ejemplo n.º 7
0
 def __init__(self, corpus):
   """Take counts from a Laplace unigram model and prepare re-estimation.

   Stores the unigram counts, a shallow copy of them in ``newCounts``, the
   model's ``total`` as ``N`` and the number of entries whose count equals 1
   as ``N_1``, then calls ``train()``.
   """
   model = LaplaceUnigramLanguageModel(corpus)
   self.Unicounts = model.LaplaceUnigramCounts
   self.newCounts = copy.copy(self.Unicounts)
   self.N = model.total
   # Count the entries equal to 1 by iterating the values directly:
   # ``values().count(1)`` only exists on the list Python 2 returns and
   # raises AttributeError on a Python 3 dict view.
   self.N_1 = sum(1 for count in self.Unicounts.values() if count == 1)
   self.train()
Ejemplo n.º 8
0
 def __init__(self, corpus):
   """Reuse a Laplace unigram model's counts, start empty bigram counts, then train."""
   unigram_model = LaplaceUnigramLanguageModel(corpus)
   self.LaplaceUnigramCounts = unigram_model.LaplaceUnigramCounts
   self.LaplaceBigramCounts = Counter()
   # 'total' is not needed in this file; it is stored because other files
   # read it from this model.
   self.total = unigram_model.total
   self.train(corpus)
class StupidBackoffLanguageModel:
  """Bigram language model that backs off to a unigram model.

  Seen bigrams are scored with their relative frequency among the followers
  of the first word; every other word pair is scored by the unigram model.
  """

  def __init__(self, corpus):
    """Build the unigram and bigram sub-models and train them on corpus."""
    self.ULM = LaplaceUnigramLanguageModel(corpus)
    self.BLM = LaplaceBigramLanguageModel(corpus)
    self.train(corpus)

  def train(self, corpus):
    """ Takes a corpus and trains both sub-models on it.
    """
    # NOTE(review): if the sub-model constructors already train on the
    # corpus, these calls would count it a second time -- confirm.
    self.ULM.train(corpus)
    self.BLM.train(corpus)

  def score(self, sentence):
    """ Takes a list of strings as argument and returns the log-probability of the
        sentence: the sum over adjacent word pairs of either the log bigram
        relative frequency or the unigram model's score for the second word.
        Sentences with fewer than two words score 0.0.
    """
    result = 0.0
    bigram = self.BLM.bigram
    for first, second in zip(sentence, sentence[1:]):
      if first in bigram and second in bigram[first]:
        # Seen bigram: count(first, second) / total count of first's followers.
        followers = bigram[first]
        result += math.log(float(followers[second]) / sum(followers.values()))
      else:
        # Back off to the unigram score of the word being predicted. It is
        # wrapped in a list because score() takes a list of strings; a bare
        # string would be iterated character by character. The old code
        # also scored `first` when `first` had never started a bigram.
        # NOTE(review): no back-off weight is applied, as in the original.
        result += self.ULM.score([second])
    return result
Ejemplo n.º 10
0
def main():
    """Evaluate a spelling corrector built on each language model.

    Every model is trained on the Holbrook training corpus, wrapped in a
    SpellCorrect instance and evaluated on the dev corpus; the outcome is
    printed under the model's heading. Point ``devPath`` at another file to
    evaluate on different data (for example the training data).
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    devPath = '../data/holbrook-tagged-dev.dat'
    trainingCorpus = HolbrookCorpus(trainPath)
    devCorpus = HolbrookCorpus(devPath)

    def report(heading, model_cls):
        # Heading first, then train, evaluate and print, in that order.
        print(heading)
        model = model_cls(trainingCorpus)
        corrector = SpellCorrect(model, trainingCorpus)
        print(str(corrector.evaluate(devCorpus)))

    report('Unigram Language Model: ', UnigramLanguageModel)
    report('Uniform Language Model: ', UniformLanguageModel)
    report('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel)
    report('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel)
    report('Stupid Backoff Language Model: ', StupidBackoffLanguageModel)
    report('Custom Language Model (based on LaplaceBigramLanguageModel): ',
           CustomLanguageModel)
    report('Custom Language Model2 (based on StupidBackoffLanguageModel): ',
           CustomLanguageModel2)
Ejemplo n.º 11
0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    print 'Uniform Language Model: '
    uniformLM = UniformLanguageModel(trainingCorpus)
    uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    uniformOutcome = uniformSpell.evaluate(devCorpus)
    print str(uniformOutcome), '\n'

    print 'Laplace Unigram Language Model: '
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus)
    laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus)
    print str(laplaceUnigramOutcome), '\n'

    #It has (accuracy: 0.012739) because of the small corpus (I think ^_^)
    print 'Good-Turing Unigram Language Model: '
    GoodTuringLM = GoodTuringUnigramLanguageModel(trainingCorpus)
    GoodTuringSpell = SpellCorrect(GoodTuringLM, trainingCorpus)
    GoodTuringOutcome = GoodTuringSpell.evaluate(devCorpus)
    print str(GoodTuringOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Laplace Bigram Language Model: '
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus)
    laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus)
    print str(laplaceBigramOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Stupid Backoff Language Model: '
    sbLM = StupidBackoffLanguageModel(trainingCorpus)
    sbSpell = SpellCorrect(sbLM, trainingCorpus)
    sbOutcome = sbSpell.evaluate(devCorpus)
    print str(sbOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Custom Language Model: '
    customLM = CustomLanguageModel(trainingCorpus)
    customSpell = SpellCorrect(customLM, trainingCorpus)
    customOutcome = customSpell.evaluate(devCorpus)
    print str(customOutcome), '\n'
Ejemplo n.º 12
0
def main():
    """Evaluate a spelling corrector built on the enabled language models.

    Every enabled model is trained on the Holbrook training corpus, wrapped
    in a SpellCorrect instance and evaluated on the dev corpus; the outcome
    is printed under the model's heading. Point ``devPath`` at another file
    to evaluate on different data (for example the training data).
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    devPath = '../data/holbrook-tagged-dev.dat'
    trainingCorpus = HolbrookCorpus(trainPath)
    devCorpus = HolbrookCorpus(devPath)

    def report(heading, model_cls):
        # Heading first, then train, evaluate and print, in that order.
        print(heading)
        model = model_cls(trainingCorpus)
        corrector = SpellCorrect(model, trainingCorpus)
        print(str(corrector.evaluate(devCorpus)))

    # Disabled run; uncomment to include it.
    #  report('Uniform Language Model: ', UniformLanguageModel)

    report('\nLaplace Unigram Language Model: ', LaplaceUnigramLanguageModel)
    report('\nLaplace Bigram Language Model: ', LaplaceBigramLanguageModel)

    # Disabled run; uncomment to include it.
    #  report('\nStupid Backoff Language Model: ', StupidBackoffLanguageModel)

    report('\nCustom Language Model: ', CustomLanguageModel)
Ejemplo n.º 13
0
def output(partId, ch_aux):
    """Run the student code for one graded part and return its output.

    ``ch_aux`` is loaded into a test corpus and spell-corrected with the
    language model selected by ``partId`` (1-2 Laplace unigram, 3-4 Laplace
    bigram, 5-6 stupid backoff, 7-8 custom). The corrected output is returned
    with the part id spliced in at the front. For any other ``partId`` a
    message is printed and ``None`` is returned.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    if partId in (1, 2):
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in (3, 4):
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in (5, 6):
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId in (7, 8):
        lm = CustomLanguageModel(trainCorpus)
    else:
        # The message used to be one literal containing '" + partId', so the
        # offending id was never shown. str() keeps the concatenation valid
        # for non-string ids.
        print('Unknown partId: ' + str(partId))
        return None

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # Put in the part ID as well: replace the first character of the
    # corrected output with '[["<partId>"],'.
    return '[["%d"],%s' % (partId, corrected[1:])
Ejemplo n.º 14
0
class CustomLanguageModel:
  """Bigram model with Kneser-Ney-style smoothing.

  Each word pair is scored as a discounted bigram relative frequency plus a
  continuation probability for the second word, weighted by a normalising
  factor for the first word.
  """

  def __init__(self, corpus):
    """Build the bigram and unigram sub-models, set the discount, then train."""
    self.BLM = LaplaceBigramLanguageModel(corpus)
    self.ULM = LaplaceUnigramLanguageModel(corpus)
    self.discount = 0.75
    # Maps a word to the number of distinct words it follows in a bigram.
    self.ends_with = dict()
    self.train(corpus)

  def train(self, corpus):
    """ Takes a corpus, trains both sub-models and fills ``ends_with``.

        Every word in the unigram table gets an entry (0 if it never ends a
        bigram).
    """
    # NOTE(review): if the sub-model constructors already train on the
    # corpus, these calls would count it a second time -- confirm.
    self.BLM.train(corpus)
    self.ULM.train(corpus)
    # One pass over the bigram table instead of scanning every bigram head
    # for every vocabulary word; the resulting dictionary is the same.
    counts = dict.fromkeys(self.ULM.unigram, 0)
    for followers in self.BLM.bigram.values():
      for word in followers:
        # Only words in the unigram table get an entry, as before.
        if word in counts:
          counts[word] += 1
    self.ends_with.update(counts)

  def normalize(self, word):
    """Return the interpolation weight for ``word``:
       discount / (unigram count + 1) * number of distinct followers."""
    count_word = self.ULM.unigram.get(word, 0) + 1
    num_type_following_word = len(self.BLM.bigram.get(word, {}))
    return (self.discount / count_word) * num_type_following_word

  def p_continuation(self, word):
    """Return ``ends_with[word]`` (0 if absent) divided by ``BLM.num_types``."""
    return float(self.ends_with.get(word, 0)) / self.BLM.num_types

  def score(self, sentence):
    """ Takes a list of strings as argument and returns the log-probability of the
        sentence, summed over adjacent word pairs. A pair whose probability
        is exactly 0 contributes log(10e-15) instead.
    """
    result = 0.0
    bigram = self.BLM.bigram
    for first, second in zip(sentence, sentence[1:]):
      if first in bigram:
        denom = self.ULM.unigram.get(first)
        if second in bigram[first]:
          # Seen bigram: discounted count, never below zero.
          numer = max(bigram[first].get(second) - self.discount, 0)
        else:
          numer = 0.0
      else:
        # first never starts a bigram: only the continuation term counts.
        numer = 0.0
        denom = 1.0

      weight = self.normalize(first)            # lambda weight
      continuation = self.p_continuation(second)
      prob = (float(numer) / denom) + (continuation * weight)
      if prob == 0:
        # log(0) is undefined, so use a tiny floor probability.
        result += math.log(10e-15)
      else:
        result += math.log(prob)
    return result
 def __init__(self, corpus):
   """Build the unigram and bigram sub-models, then train on corpus."""
   # The unigram model is built before the bigram model.
   self.ULM = LaplaceUnigramLanguageModel(corpus)
   self.BLM = LaplaceBigramLanguageModel(corpus)
   self.train(corpus)