def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data. """ trainPath = '../data/micro/en_US/' trainingCorpus = CapstoneCorpus(trainPath) #print str(trainingCorpus) sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd" tokens = Tokenize(sent) print 'Uniform Language Model: ' uniformLM = UniformLanguageModel(trainingCorpus) print "VocSize= " + str(len(uniformLM.words)) print sent print tokens print "uniform score=" + str(uniformLM.score(tokens)) print 'Unigram Language Model: ' unigramLM = UnigramLanguageModel(trainingCorpus) print "VocSize= " + str(len(unigramLM.unigramCounts)) print "unigram score=" + str(unigramLM.score(tokens)) print 'Laplace Unigram Language Model: ' laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramLM.save("smallUnigram.LM") print "VocSize= " + str(len(laplaceUnigramLM.f1)) print "unigram score=" + str(laplaceUnigramLM.score(tokens)) print 'Laplace Bigram Language Model: ' laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramLM.save("smallBigram.LM") print "bigram score=" + str(laplaceBigramLM.score(tokens)) print 'Laplace Ngram Language Model: N=2' laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus,2) laplaceN2gramLM.save("smallN2gram.LM") print "N=2gram score=" + str(laplaceN2gramLM.score(tokens)) print 'Laplace Ngram Language Model: N=3' laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus,3) laplaceN3gramLM.save("smallN3gram.LM") print "N=3gram score=" + str(laplaceN2gramLM.score(tokens)) print 'Custom Language Model: ' customLM = CustomLanguageModel(trainingCorpus,N=2) print "Custom LM score=" + str(customLM.score(tokens))
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data. """ trainPath = '../data/micro/en_US/' trainingCorpus = CapstoneCorpus(trainPath) #print str(trainingCorpus) sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd" tokens = Tokenize(sent) print 'Uniform Language Model: ' uniformLM = UniformLanguageModel(trainingCorpus) print "VocSize= " + str(len(uniformLM.words)) print sent print tokens print "uniform score=" + str(uniformLM.score(tokens)) print 'Unigram Language Model: ' unigramLM = UnigramLanguageModel(trainingCorpus) print "VocSize= " + str(len(unigramLM.unigramCounts)) print "unigram score=" + str(unigramLM.score(tokens)) print 'Laplace Unigram Language Model: ' laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramLM.save("smallUnigram.LM") print "VocSize= " + str(len(laplaceUnigramLM.f1)) print "unigram score=" + str(laplaceUnigramLM.score(tokens)) print 'Laplace Bigram Language Model: ' laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramLM.save("smallBigram.LM") print "bigram score=" + str(laplaceBigramLM.score(tokens)) print 'Laplace Ngram Language Model: N=2' laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2) laplaceN2gramLM.save("smallN2gram.LM") print "N=2gram score=" + str(laplaceN2gramLM.score(tokens)) print 'Laplace Ngram Language Model: N=3' laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3) laplaceN3gramLM.save("smallN3gram.LM") print "N=3gram score=" + str(laplaceN2gramLM.score(tokens)) print 'Custom Language Model: ' customLM = CustomLanguageModel(trainingCorpus, N=2) print "Custom LM score=" + str(customLM.score(tokens))
def output(self, partId, ch_aux): """Uses the student code to compute the output for test cases.""" trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat') if partId in [1, 2]: editModel = EditModel('../data/count_1edit.txt', trainCorpus) return json.dumps([[(e.editedWord, e.rule()) for e in editModel.edits(line.strip())] for line in ch_aux.split("\n")]) else: testCorpus = HolbrookCorpus() testCorpus.slurpString(ch_aux) lm = None if partId in [3, 4]: lm = LaplaceUnigramLanguageModel(trainCorpus) elif partId in [5, 6]: lm = LaplaceBigramLanguageModel(trainCorpus) elif partId in [7, 8]: lm = StupidBackoffLanguageModel(trainCorpus) elif partId in [9, 10]: lm = CustomLanguageModel(trainCorpus) else: print 'Unknown partId: " + partId' return None speller = SpellCorrect(lm, trainCorpus) output = speller.correctCorpus(testCorpus) # put in the part ID as well output = '[["%d"],%s' % (partId, output[1:]) return output
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" trainPath = '../data/holbrook-tagged-train.dat' trainingCorpus = HolbrookCorpus(trainPath) devPath = '../data/holbrook-tagged-dev.dat' devCorpus = HolbrookCorpus(devPath) print 'Unigram Language Model: ' unigramLM = UnigramLanguageModel(trainingCorpus) unigramSpell = SpellCorrect(unigramLM, trainingCorpus) unigramOutcome = unigramSpell.evaluate(devCorpus) print str(unigramOutcome) print 'Uniform Language Model: ' uniformLM = UniformLanguageModel(trainingCorpus) uniformSpell = SpellCorrect(uniformLM, trainingCorpus) uniformOutcome = uniformSpell.evaluate(devCorpus) print str(uniformOutcome) print 'Laplace Unigram Language Model: ' laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus) laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus) print str(laplaceUnigramOutcome) print 'Laplace Bigram Language Model: ' laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus) laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus) print str(laplaceBigramOutcome) print 'Stupid Backoff Language Model: ' sbLM = StupidBackoffLanguageModel(trainingCorpus) sbSpell = SpellCorrect(sbLM, trainingCorpus) sbOutcome = sbSpell.evaluate(devCorpus) print str(sbOutcome) print 'Custom Language Model (based on LaplaceBigramLanguageModel): ' customLM = CustomLanguageModel(trainingCorpus) customSpell = SpellCorrect(customLM, trainingCorpus) customOutcome = customSpell.evaluate(devCorpus) print str(customOutcome) print 'Custom Language Model2 (based on StupidBackoffLanguageModel): ' customLM2 = CustomLanguageModel2(trainingCorpus) customSpell2 = SpellCorrect(customLM2, trainingCorpus) customOutcome2 = customSpell2.evaluate(devCorpus) print str(customOutcome2)
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" trainPath = '../data/holbrook-tagged-train.dat' trainingCorpus = HolbrookCorpus(trainPath) devPath = '../data/holbrook-tagged-dev.dat' devCorpus = HolbrookCorpus(devPath) print 'Uniform Language Model: ' uniformLM = UniformLanguageModel(trainingCorpus) uniformSpell = SpellCorrect(uniformLM, trainingCorpus) uniformOutcome = uniformSpell.evaluate(devCorpus) print str(uniformOutcome), '\n' print 'Laplace Unigram Language Model: ' laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus) laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus) print str(laplaceUnigramOutcome), '\n' #It has (accuracy: 0.012739) because of the small corpus (I think ^_^) print 'Good-Turing Unigram Language Model: ' GoodTuringLM = GoodTuringUnigramLanguageModel(trainingCorpus) GoodTuringSpell = SpellCorrect(GoodTuringLM, trainingCorpus) GoodTuringOutcome = GoodTuringSpell.evaluate(devCorpus) print str(GoodTuringOutcome), '\n' #This model takes some time, about (70) seconds print 'Laplace Bigram Language Model: ' laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus) laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus) print str(laplaceBigramOutcome), '\n' #This model takes some time, about (70) seconds print 'Stupid Backoff Language Model: ' sbLM = StupidBackoffLanguageModel(trainingCorpus) sbSpell = SpellCorrect(sbLM, trainingCorpus) sbOutcome = sbSpell.evaluate(devCorpus) print str(sbOutcome), '\n' #This model takes some time, about (70) seconds print 'Custom Language Model: ' customLM = CustomLanguageModel(trainingCorpus) customSpell = SpellCorrect(customLM, trainingCorpus) customOutcome = customSpell.evaluate(devCorpus) print str(customOutcome), '\n'
def main():
    """Trains all of the language models and tests them on the dev data.

    Change devPath if you wish to do things like test on the training data.
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)
    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    # print('Uniform Language Model: ')
    # uniformLM = UniformLanguageModel(trainingCorpus)
    # uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    # uniformOutcome = uniformSpell.evaluate(devCorpus)
    # print(str(uniformOutcome))

    print('\nLaplace Unigram Language Model: ')
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus)
    laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus)
    print(str(laplaceUnigramOutcome))

    print('\nLaplace Bigram Language Model: ')
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus)
    laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus)
    print(str(laplaceBigramOutcome))

    # print('\nStupid Backoff Language Model: ')
    # sbLM = StupidBackoffLanguageModel(trainingCorpus)
    # sbSpell = SpellCorrect(sbLM, trainingCorpus)
    # sbOutcome = sbSpell.evaluate(devCorpus)
    # print(str(sbOutcome))

    # BUG FIX: the header print was commented out while the Custom model code
    # below still ran, so its outcome printed without any label.
    print('\nCustom Language Model: ')
    customLM = CustomLanguageModel(trainingCorpus)
    customSpell = SpellCorrect(customLM, trainingCorpus)
    customOutcome = customSpell.evaluate(devCorpus)
    print(str(customOutcome))
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Selects the language model by partId, spell-corrects the test corpus
    parsed from ch_aux, and returns the corrected output prefixed with the
    part ID. Returns None for an unrecognized partId.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)
    lm = None
    if partId == 1 or partId == 2:
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId == 3 or partId == 4:
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId == 5 or partId == 6:
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId == 7 or partId == 8:
        lm = CustomLanguageModel(trainCorpus)
    else:
        # BUG FIX: the concatenation was trapped inside the string literal
        # ('Unknown partId: " + partId'), so the partId value never printed.
        print('Unknown partId: ' + str(partId))
        return None
    speller = SpellCorrect(lm, trainCorpus)
    output = speller.correctCorpus(testCorpus)
    # put in the part ID as well
    output = '[["%d"],%s' % (partId, output[1:])
    return output
# Earlier experiment with coef=1, kept for reference:
# bad_model = CustomLanguageModel(bad_train, n=n, coef=1)
#
# print('Evaluation of the models...')
# true = 0
# for i in range(len(good_test)):
#     print(str(i) + '/' + str(len(good_test)))
#     if good_model.score(good_test[i]) > bad_model.score(good_test[i]):
#         true += 1
#     if good_model.score(bad_test[i]) < bad_model.score(bad_test[i]):
#         true += 1
#
# print('accuracy ' + str(n) + '-gram, coef=1: ' + str(true / (len(good_test) * 2)))

#################################################
# coef = 0.4

print('Creation of the models...')
good_model = CustomLanguageModel(good_train, n=n, coef=0.4)
bad_model = CustomLanguageModel(bad_train, n=n, coef=0.4)

print('Evaluation of the models...')
# Each test pair yields two classification decisions: the "good" model should
# prefer the good sentence, and the "bad" model should prefer the bad one.
true = 0
progress = tqdm.tqdm(total=len(good_test))
for idx in range(len(good_test)):
    progress.update()
    good_on_good = good_model.score(good_test[idx])
    bad_on_good = bad_model.score(good_test[idx])
    if good_on_good > bad_on_good:
        true += 1
    good_on_bad = good_model.score(bad_test[idx])
    bad_on_bad = bad_model.score(bad_test[idx])
    if good_on_bad < bad_on_bad:
        true += 1
progress.close()

print('accuracy ' + str(n) + '-gram, coef=0.4: ' + str(true / (len(good_test) * 2)))