def output(self, partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Parts 1-2 run the edit model over each line of ``ch_aux``; parts 3-10
    run spelling correction over ``ch_aux`` with the language model that
    belongs to the part.

    Args:
        partId: Integer part number (it is formatted with ``%d`` below).
        ch_aux: Test input as one string.

    Returns:
        For parts 1-2, a JSON string with one list of
        ``(editedWord, rule)`` pairs per input line.  For parts 3-10, the
        corrected-corpus string with its first character replaced by a
        ``[["<partId>"],`` prefix.  ``None`` for any other part number.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

    if partId in (1, 2):
        editModel = EditModel('../data/count_1edit.txt', trainCorpus)
        rows = []
        for line in ch_aux.split("\n"):
            rows.append([(e.editedWord, e.rule())
                         for e in editModel.edits(line.strip())])
        return json.dumps(rows)

    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    if partId in (3, 4):
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in (5, 6):
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in (7, 8):
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId in (9, 10):
        lm = CustomLanguageModel(trainCorpus)
    else:
        # The previous message was a single mis-quoted literal that printed
        # the text '" + partId' instead of the offending value.  %s is used
        # so that a non-integer part id is reported rather than raising.
        print('Unknown partId: %s' % partId)
        return None

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well (it replaces the first character)
    return '[["%d"],%s' % (partId, corrected[1:])
def output(self, partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Args:
        partId: Integer part number.  1-2 select the edit model; 3-10 select
            a language model for spelling correction.
        ch_aux: Test input as one string.

    Returns:
        A JSON string of per-line ``(editedWord, rule)`` lists for parts
        1-2; the corrected-corpus string prefixed with the part id for parts
        3-10; ``None`` when the part id is not recognised.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

    if partId in (1, 2):
        editModel = EditModel('../data/count_1edit.txt', trainCorpus)
        return json.dumps([
            [(edit.editedWord, edit.rule())
             for edit in editModel.edits(line.strip())]
            for line in ch_aux.split("\n")
        ])

    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    # Two consecutive part ids share one language model.
    modelForPart = {
        3: LaplaceUnigramLanguageModel,
        4: LaplaceUnigramLanguageModel,
        5: LaplaceBigramLanguageModel,
        6: LaplaceBigramLanguageModel,
        7: StupidBackoffLanguageModel,
        8: StupidBackoffLanguageModel,
        9: CustomLanguageModel,
        10: CustomLanguageModel,
    }
    modelClass = modelForPart.get(partId)
    if modelClass is None:
        # Previously a mis-quoted literal printed '" + partId' verbatim;
        # interpolate the real value instead.
        print('Unknown partId: %s' % partId)
        return None

    speller = SpellCorrect(modelClass(trainCorpus), trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well (it replaces the first character)
    return '[["%d"],%s' % (partId, corrected[1:])
def main():
    """Trains all of the language models and tests them on the dev data.

    Change devPath if you wish to do things like test on the training data.
    For every model a heading is printed, followed by the string form of the
    result returned by ``SpellCorrect.evaluate``.
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    def report(heading, modelClass):
        # Train one model on the training corpus, evaluate it on the dev
        # corpus and print the outcome under its heading.
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3, unlike the print statement used before.
        print(heading)
        languageModel = modelClass(trainingCorpus)
        speller = SpellCorrect(languageModel, trainingCorpus)
        print(str(speller.evaluate(devCorpus)))

    report('Unigram Language Model: ', UnigramLanguageModel)
    report('Uniform Language Model: ', UniformLanguageModel)
    report('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel)
    report('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel)
    report('Stupid Backoff Language Model: ', StupidBackoffLanguageModel)
    report('Custom Language Model (based on LaplaceBigramLanguageModel): ',
           CustomLanguageModel)
    report('Custom Language Model2 (based on StupidBackoffLanguageModel): ',
           CustomLanguageModel2)
def main():
    """Train the Stupid Backoff language model and test it on the dev data.

    Point the second corpus at a different file if you wish to do things
    like test on the training data.
    """
    trainingCorpus = HolbrookCorpus('data/holbrook-tagged-train.dat')
    devCorpus = HolbrookCorpus('data/holbrook-tagged-dev.dat')

    print('Stupid Backoff Language Model: ')
    speller = SpellCorrect(StupidBackoffLanguageModel(trainingCorpus),
                           trainingCorpus)
    result = speller.evaluate(devCorpus)
    print(str(result))
def main():
    """Trains all of the language models and tests them on the dev data.

    Change devPath if you wish to do things like test on the training data.
    For every model a heading is printed, then the string form of the
    evaluation result followed by a blank line.
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    # (heading, language model class), in reporting order.
    # NOTE(review): the author's remarks below are assumed to describe the
    # entry that follows them - confirm against the original layout.
    models = [
        ('Uniform Language Model: ', UniformLanguageModel),
        ('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel),
        # It has (accuracy: 0.012739) because of the small corpus (I think ^_^)
        ('Good-Turing Unigram Language Model: ',
         GoodTuringUnigramLanguageModel),
        # This model takes some time, about (70) seconds
        ('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel),
        # This model takes some time, about (70) seconds
        ('Stupid Backoff Language Model: ', StupidBackoffLanguageModel),
        # This model takes some time, about (70) seconds
        ('Custom Language Model: ', CustomLanguageModel),
    ]

    for heading, modelClass in models:
        print(heading)
        speller = SpellCorrect(modelClass(trainingCorpus), trainingCorpus)
        outcome = speller.evaluate(devCorpus)
        # The Python 2 statement `print str(outcome), '\n'` wrote the text,
        # a space, a newline and then the statement's own newline.  A single
        # pre-joined argument yields those exact bytes on Python 2 and 3.
        print(str(outcome) + ' \n')
def main():
    """Sanity checks the edit model on the word 'hi'.

    Builds the expected delete/insert/transpose/replace edits for 'hi' and
    passes each expected set, together with the set produced by the edit
    model, to ``checkOverlap``.
    """
    trainingCorpus = HolbrookCorpus('data/holbrook-tagged-train.dat')
    editModel = EditModel("data/count_1edit.txt", trainingCorpus)

    # These are for testing, you can ignore them
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    # NOTE(review): unlike the other expected sets this one holds strings,
    # not Edit objects - confirm that is what checkOverlap expects.
    expectedDeletes = set(['Edit(editedWord=i, rule=<h|<)',
                           'Edit(editedWord=h, rule=hi|h)'])

    # One inserted letter before 'h', between 'h' and 'i', and after 'i'.
    expectedInserts = set(
        [Edit(ch + 'hi', '<', '<' + ch) for ch in alphabet]
        + [Edit('h' + ch + 'i', 'h', 'h' + ch) for ch in alphabet]
        + [Edit('hi' + ch, 'i', 'i' + ch) for ch in alphabet]
    )

    expectedTransposes = set([Edit('ih', 'hi', 'ih')])

    # Each letter of 'hi' swapped for every *other* letter of the alphabet.
    expectedReplaces = set(
        [Edit(ch + 'i', 'h', ch) for ch in alphabet if ch != 'h']
        + [Edit('h' + ch, 'i', ch) for ch in alphabet if ch != 'i']
    )

    print("***Code Sanity Check***")

    print("Delete edits for 'hi'")
    checkOverlap(set(editModel.deleteEdits('hi')), expectedDeletes)

    print("Insert edits for 'hi'")
    checkOverlap(set(editModel.insertEdits('hi')), expectedInserts)

    print("Transpose edits for 'hi'")
    checkOverlap(set(editModel.transposeEdits('hi')), expectedTransposes)

    print("Replace edits for 'hi'")
    checkOverlap(set(editModel.replaceEdits('hi')), expectedReplaces)
def main():
    """Trains the enabled language models and tests them on the dev data.

    Change devPath if you wish to do things like test on the training data.
    For every enabled model a heading is printed, followed by the string
    form of the result returned by ``SpellCorrect.evaluate``.
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    def report(heading, modelClass):
        # Train one model, evaluate it on the dev corpus, print the outcome.
        print(heading)
        languageModel = modelClass(trainingCorpus)
        speller = SpellCorrect(languageModel, trainingCorpus)
        print(str(speller.evaluate(devCorpus)))

    # Disabled runs are kept as one-liners; uncomment to include them.
    # report('Uniform Language Model: ', UniformLanguageModel)
    report('\nLaplace Unigram Language Model: ', LaplaceUnigramLanguageModel)
    report('\nLaplace Bigram Language Model: ', LaplaceBigramLanguageModel)
    # report('\nStupid Backoff Language Model: ', StupidBackoffLanguageModel)

    # The heading for this run had been commented out while the evaluation
    # itself stayed live, so its result was printed with no label directly
    # under the Laplace Bigram result.  The heading is restored here.
    report('\nCustom Language Model: ', CustomLanguageModel)
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Args:
        partId: Integer part number; 1-8 select a language model (two
            consecutive ids per model).
        ch_aux: Test input as one string, loaded into the test corpus.

    Returns:
        The corrected-corpus string with its first character replaced by a
        ``[["<partId>"],`` prefix, or ``None`` for an unknown part id.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    if partId in (1, 2):
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in (3, 4):
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in (5, 6):
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId in (7, 8):
        lm = CustomLanguageModel(trainCorpus)
    else:
        # The previous message was a single mis-quoted literal that printed
        # the text '" + partId' instead of the offending value.  %s is used
        # so that a non-integer part id is reported rather than raising.
        print('Unknown partId: %s' % partId)
        return None

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well (it replaces the first character)
    return '[["%d"],%s' % (partId, corrected[1:])
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Args:
        partId: Integer part number; 1-8 select a language model (two
            consecutive ids per model).
        ch_aux: Test input as one string, loaded into the test corpus.

    Returns:
        The corrected-corpus string prefixed with the part id, or ``None``
        when the part id is not recognised.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    modelForPart = {
        1: LaplaceUnigramLanguageModel,
        2: LaplaceUnigramLanguageModel,
        3: LaplaceBigramLanguageModel,
        4: LaplaceBigramLanguageModel,
        5: StupidBackoffLanguageModel,
        6: StupidBackoffLanguageModel,
        7: CustomLanguageModel,
        8: CustomLanguageModel,
    }
    modelClass = modelForPart.get(partId)
    if modelClass is None:
        # Previously a Python 2 print statement with a mis-quoted literal
        # that emitted '" + partId' verbatim.  print(...) with one argument
        # works on Python 2 and 3 and now shows the real value.
        print('Unknown partId: %s' % partId)
        return None

    speller = SpellCorrect(modelClass(trainCorpus), trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well (it replaces the first character)
    return '[["%d"],%s' % (partId, corrected[1:])
def langModel():
    """Return a Laplace bigram language model built from the training corpus.

    The corpus is loaded from 'es-en/train/europarl-v7.es-en.en'.
    """
    # Alternative corpus noted by the author: 'holbrook-tagged-train.dat'
    corpus = HolbrookCorpus("es-en/train/europarl-v7.es-en.en")
    return LaplaceBigramLanguageModel(corpus)
#!/usr/bin/env python
# -*- coding: utf-8 -*
import numpy
import nltk
from nltk.tag.stanford import POSTagger
from Datum import Datum
from Sentence import Sentence
from HolbrookCorpus import HolbrookCorpus
from StupidBackoffLanguageModel import StupidBackoffLanguageModel

### Test Bigram Backoff Language model
eng_corpus = HolbrookCorpus('holbrook-tagged-train.dat')
eng_model = StupidBackoffLanguageModel(eng_corpus)

# The same sentence is scored and reported twice.
for sentence in ('what do you want to eat for dinner',) * 2:
    print('Score for sentence "{0}": {1}'.format(
        sentence, str(eng_model.score(sentence.split()))))

### Test POS (disabled)
## Configure this to be your Java directory
# nltk.internals.config_java(u"C:/Program Files/Java/jre7/bin/java.exe")
# Sample input for the tagger (a pre-segmented Chinese sentence):
# chunk = u"古往今来 , 有 多少 的 成功者 被 人们 赞赏"
# text = nltk.word_tokenize(chunk.encode('utf-8'))
#st = POSTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')
def main():
    """Train all the implemented language models and test them on the test data.

    A progress line per model is printed, and each model's evaluation result
    and run time (as returned by ``SpellCorrection.evaluation``) are written
    to 'ComparisonLM.log'.

    Raises:
        IOError: if the training or test corpus file cannot be opened.
    """
    # The corpus is a list of sentences where each misspelled word appears
    # alongside its correction, including start/stop symbols, e.g.
    #   <s> lucky (luckily) enough it was mostly tinned (tin) food </s>
    trainPath = './data/holbrook-tagged-train.dat'
    testPath = './data/holbrook-tagged-dev.dat'

    # Fail early with a readable hint when a corpus file is missing.  The
    # previous probe read the whole training file, checked only that file,
    # and carried on after printing the hint.
    for path in (trainPath, testPath):
        try:
            with open(path, "r"):
                pass
        except IOError:
            print("Files not found. Check if in the right directory path!")
            raise

    trainCorpus = HolbrookCorpus(trainPath)
    testCorpus = HolbrookCorpus(testPath)

    # (display name, language model class), in reporting order.
    models = [
        ('Unigram Language Model', UnigramLM),
        ('Laplace Unigram Language Model', LaplaceUnigramLM),
        ('Laplace Bigram Language Model', LaplaceBigramLM),
        ('Stupid Backoff Language Model', StupidBackoffLM),
        ('Stupid Backoff with Add-one Smoothing Language Model',
         StupidBackoffSmoothLM),
        ('Modified Kneser Ney Smoothing Language Model', MKneserNeyLM),
        ('Katz Backoff Smoothing Language Model', KatzBackoffGTLM),
    ]

    with open('ComparisonLM.log', 'w') as f:
        f.write('Comparison of different language models: \n')
        for name, modelClass in models:
            # A blank line separates the header and each model section; none
            # is written after the last section.
            f.write('\n')
            print(name + ' Evaluation')
            f.write(name + ': \n')
            languageModel = modelClass(trainCorpus)
            speller = SpellCorrection(languageModel, trainCorpus)
            result, seconds = speller.evaluation(testCorpus)
            f.write(str(result))
            f.write('\nTime to run (seconds): ')
            f.write(str(seconds) + '\n')