def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data. """ trainPath = '../data/micro/en_US/' trainingCorpus = CapstoneCorpus(trainPath) #print str(trainingCorpus) sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd" tokens = Tokenize(sent) print 'Uniform Language Model: ' uniformLM = UniformLanguageModel(trainingCorpus) print "VocSize= " + str(len(uniformLM.words)) print sent print tokens print "uniform score=" + str(uniformLM.score(tokens)) print 'Unigram Language Model: ' unigramLM = UnigramLanguageModel(trainingCorpus) print "VocSize= " + str(len(unigramLM.unigramCounts)) print "unigram score=" + str(unigramLM.score(tokens)) print 'Laplace Unigram Language Model: ' laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramLM.save("smallUnigram.LM") print "VocSize= " + str(len(laplaceUnigramLM.f1)) print "unigram score=" + str(laplaceUnigramLM.score(tokens)) print 'Laplace Bigram Language Model: ' laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramLM.save("smallBigram.LM") print "bigram score=" + str(laplaceBigramLM.score(tokens)) print 'Laplace Ngram Language Model: N=2' laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus,2) laplaceN2gramLM.save("smallN2gram.LM") print "N=2gram score=" + str(laplaceN2gramLM.score(tokens)) print 'Laplace Ngram Language Model: N=3' laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus,3) laplaceN3gramLM.save("smallN3gram.LM") print "N=3gram score=" + str(laplaceN2gramLM.score(tokens)) print 'Custom Language Model: ' customLM = CustomLanguageModel(trainingCorpus,N=2) print "Custom LM score=" + str(customLM.score(tokens))
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data. """ trainPath = '../data/micro/en_US/' trainingCorpus = CapstoneCorpus(trainPath) #print str(trainingCorpus) sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd" tokens = Tokenize(sent) print 'Uniform Language Model: ' uniformLM = UniformLanguageModel(trainingCorpus) print "VocSize= " + str(len(uniformLM.words)) print sent print tokens print "uniform score=" + str(uniformLM.score(tokens)) print 'Unigram Language Model: ' unigramLM = UnigramLanguageModel(trainingCorpus) print "VocSize= " + str(len(unigramLM.unigramCounts)) print "unigram score=" + str(unigramLM.score(tokens)) print 'Laplace Unigram Language Model: ' laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramLM.save("smallUnigram.LM") print "VocSize= " + str(len(laplaceUnigramLM.f1)) print "unigram score=" + str(laplaceUnigramLM.score(tokens)) print 'Laplace Bigram Language Model: ' laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramLM.save("smallBigram.LM") print "bigram score=" + str(laplaceBigramLM.score(tokens)) print 'Laplace Ngram Language Model: N=2' laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2) laplaceN2gramLM.save("smallN2gram.LM") print "N=2gram score=" + str(laplaceN2gramLM.score(tokens)) print 'Laplace Ngram Language Model: N=3' laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3) laplaceN3gramLM.save("smallN3gram.LM") print "N=3gram score=" + str(laplaceN2gramLM.score(tokens)) print 'Custom Language Model: ' customLM = CustomLanguageModel(trainingCorpus, N=2) print "Custom LM score=" + str(customLM.score(tokens))
def output(self, partId, ch_aux):
    """Uses the student code to compute the output for test cases."""
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    if partId in [1, 2]:
        editModel = EditModel('../data/count_1edit.txt', trainCorpus)
        return json.dumps([[(e.editedWord, e.rule()) for e in editModel.edits(line.strip())]
                           for line in ch_aux.split("\n")])
    else:
        testCorpus = HolbrookCorpus()
        testCorpus.slurpString(ch_aux)
        lm = None
        if partId in [3, 4]:
            lm = LaplaceUnigramLanguageModel(trainCorpus)
        elif partId in [5, 6]:
            lm = LaplaceBigramLanguageModel(trainCorpus)
        elif partId in [7, 8]:
            lm = StupidBackoffLanguageModel(trainCorpus)
        elif partId in [9, 10]:
            lm = CustomLanguageModel(trainCorpus)
        else:
            print 'Unknown partId: ' + str(partId)
            return None
        speller = SpellCorrect(lm, trainCorpus)
        output = speller.correctCorpus(testCorpus)
        # prepend the part ID to the JSON output
        output = '[["%d"],%s' % (partId, output[1:])
        return output
def train(self, corpus):
    """Takes a corpus and trains your language model.
    Compute any counts or other corpus statistics in this function."""
    self.bigram = LaplaceBigramLanguageModel(corpus)
    self.uniGramCount = self.bigram.uniGram.uniDict
    self.biGramCount = self.bigram.bigramCount
def __init__(self, corpus): """Initialize your data structures in the constructor.""" # TODO your code here self.BLM = LaplaceBigramLanguageModel(corpus) self.ULM = LaplaceUnigramLanguageModel(corpus) self.discount = 0.75 self.ends_with = dict() #number of bigrams that ends with a particular word self.train(corpus)
def __init__(self, corpus): """Initialize your data structures in the constructor.""" self.trigramCount = collections.defaultdict(lambda: 0) bg = LaplaceBigramLanguageModel(corpus) self.bigramCount = bg.bigramCount self.uniGram = bg.uniGram.uniDict self.train(corpus) self.vocab = len(self.bigramCount.keys())
class StupidBackoffLanguageModel:
    def __init__(self, corpus):
        """Initialize your data structures in the constructor."""
        self.ULM = LaplaceUnigramLanguageModel(corpus)
        self.BLM = LaplaceBigramLanguageModel(corpus)
        self.train(corpus)

    def train(self, corpus):
        """Takes a corpus and trains your language model.
        Compute any counts or other corpus statistics in this function."""
        self.ULM.train(corpus)
        self.BLM.train(corpus)

    def score(self, sentence):
        """Takes a list of strings as argument and returns the log-probability
        of the sentence using your language model. Use whatever data you
        computed in train() here."""
        result = 0.0
        for i in range(len(sentence) - 1):
            first = sentence[i]
            second = sentence[i + 1]
            if first in self.BLM.bigram and second in self.BLM.bigram[first]:
                # the bigram was seen: use its relative frequency directly
                numer = self.BLM.bigram[first].get(second)
                denom = sum(self.BLM.bigram[first].values())
                result += math.log(float(numer) / denom)
            else:
                # unseen bigram: back off to the unigram score of the
                # following word (score expects a list of strings)
                result += self.ULM.score([second])
        return result
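Note that the score() above backs off to the unigram probability unscaled. Canonical Stupid Backoff (Brants et al., 2007) multiplies the backed-off score by a fixed factor, typically 0.4. A self-contained sketch over raw count dictionaries; the names and the add-one smoothing on the backoff term are illustrative assumptions, not taken from the class above:

import math

def stupid_backoff_logscore(tokens, bigram_counts, unigram_counts, alpha=0.4):
    # bigram_counts maps (w1, w2) pairs to counts; unigram_counts maps words to counts.
    total = float(sum(unigram_counts.values()))
    vocab = len(unigram_counts)
    result = 0.0
    for first, second in zip(tokens, tokens[1:]):
        pair_count = bigram_counts.get((first, second), 0)
        if pair_count > 0:
            # seen bigram: use its relative frequency directly
            result += math.log(pair_count / float(unigram_counts[first]))
        else:
            # unseen bigram: back off to an add-one unigram score, scaled by alpha
            result += math.log(alpha * (unigram_counts.get(second, 0) + 1) / (total + vocab))
    return result

# The scores are not normalized probabilities, hence "stupid" backoff.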
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" trainPath = '../data/holbrook-tagged-train.dat' trainingCorpus = HolbrookCorpus(trainPath) devPath = '../data/holbrook-tagged-dev.dat' devCorpus = HolbrookCorpus(devPath) print 'Unigram Language Model: ' unigramLM = UnigramLanguageModel(trainingCorpus) unigramSpell = SpellCorrect(unigramLM, trainingCorpus) unigramOutcome = unigramSpell.evaluate(devCorpus) print str(unigramOutcome) print 'Uniform Language Model: ' uniformLM = UniformLanguageModel(trainingCorpus) uniformSpell = SpellCorrect(uniformLM, trainingCorpus) uniformOutcome = uniformSpell.evaluate(devCorpus) print str(uniformOutcome) print 'Laplace Unigram Language Model: ' laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus) laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus) print str(laplaceUnigramOutcome) print 'Laplace Bigram Language Model: ' laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus) laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus) print str(laplaceBigramOutcome) print 'Stupid Backoff Language Model: ' sbLM = StupidBackoffLanguageModel(trainingCorpus) sbSpell = SpellCorrect(sbLM, trainingCorpus) sbOutcome = sbSpell.evaluate(devCorpus) print str(sbOutcome) print 'Custom Language Model (based on LaplaceBigramLanguageModel): ' customLM = CustomLanguageModel(trainingCorpus) customSpell = SpellCorrect(customLM, trainingCorpus) customOutcome = customSpell.evaluate(devCorpus) print str(customOutcome) print 'Custom Language Model2 (based on StupidBackoffLanguageModel): ' customLM2 = CustomLanguageModel2(trainingCorpus) customSpell2 = SpellCorrect(customLM2, trainingCorpus) customOutcome2 = customSpell2.evaluate(devCorpus) print str(customOutcome2)
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" trainPath = '../data/holbrook-tagged-train.dat' trainingCorpus = HolbrookCorpus(trainPath) devPath = '../data/holbrook-tagged-dev.dat' devCorpus = HolbrookCorpus(devPath) print 'Uniform Language Model: ' uniformLM = UniformLanguageModel(trainingCorpus) uniformSpell = SpellCorrect(uniformLM, trainingCorpus) uniformOutcome = uniformSpell.evaluate(devCorpus) print str(uniformOutcome), '\n' print 'Laplace Unigram Language Model: ' laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus) laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus) print str(laplaceUnigramOutcome), '\n' #It has (accuracy: 0.012739) because of the small corpus (I think ^_^) print 'Good-Turing Unigram Language Model: ' GoodTuringLM = GoodTuringUnigramLanguageModel(trainingCorpus) GoodTuringSpell = SpellCorrect(GoodTuringLM, trainingCorpus) GoodTuringOutcome = GoodTuringSpell.evaluate(devCorpus) print str(GoodTuringOutcome), '\n' #This model takes some time, about (70) seconds print 'Laplace Bigram Language Model: ' laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus) laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus) print str(laplaceBigramOutcome), '\n' #This model takes some time, about (70) seconds print 'Stupid Backoff Language Model: ' sbLM = StupidBackoffLanguageModel(trainingCorpus) sbSpell = SpellCorrect(sbLM, trainingCorpus) sbOutcome = sbSpell.evaluate(devCorpus) print str(sbOutcome), '\n' #This model takes some time, about (70) seconds print 'Custom Language Model: ' customLM = CustomLanguageModel(trainingCorpus) customSpell = SpellCorrect(customLM, trainingCorpus) customOutcome = customSpell.evaluate(devCorpus) print str(customOutcome), '\n'
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" trainPath = '../data/holbrook-tagged-train.dat' trainingCorpus = HolbrookCorpus(trainPath) devPath = '../data/holbrook-tagged-dev.dat' devCorpus = HolbrookCorpus(devPath) # print('Uniform Language Model: ') # uniformLM = UniformLanguageModel(trainingCorpus) # uniformSpell = SpellCorrect(uniformLM, trainingCorpus) # uniformOutcome = uniformSpell.evaluate(devCorpus) # print(str(uniformOutcome)) print('\nLaplace Unigram Language Model: ') laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus) laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus) laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus) print(str(laplaceUnigramOutcome)) print('\nLaplace Bigram Language Model: ') laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus) laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus) laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus) print(str(laplaceBigramOutcome)) # print('\nStupid Backoff Language Model: ') # sbLM = StupidBackoffLanguageModel(trainingCorpus) # sbSpell = SpellCorrect(sbLM, trainingCorpus) # sbOutcome = sbSpell.evaluate(devCorpus) # print(str(sbOutcome)) # print('\nCustom Language Model: ') customLM = CustomLanguageModel(trainingCorpus) customSpell = SpellCorrect(customLM, trainingCorpus) customOutcome = customSpell.evaluate(devCorpus) print(str(customOutcome))
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases."""
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)
    lm = None
    if partId == 1 or partId == 2:
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId == 3 or partId == 4:
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId == 5 or partId == 6:
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId == 7 or partId == 8:
        lm = CustomLanguageModel(trainCorpus)
    else:
        print('Unknown partId: ' + str(partId))
        return None
    speller = SpellCorrect(lm, trainCorpus)
    output = speller.correctCorpus(testCorpus)
    # prepend the part ID to the JSON output
    output = '[["%d"],%s' % (partId, output[1:])
    return output
class CustomLanguageModel:
    """Bigram model with Kneser-Ney smoothing."""

    def __init__(self, corpus):
        """Initialize your data structures in the constructor."""
        self.BLM = LaplaceBigramLanguageModel(corpus)
        self.ULM = LaplaceUnigramLanguageModel(corpus)
        self.discount = 0.75
        self.ends_with = dict()  # number of bigrams that end with a particular word
        self.train(corpus)

    def train(self, corpus):
        """Takes a corpus and trains your language model.
        Compute any counts or other corpus statistics in this function."""
        self.BLM.train(corpus)
        self.ULM.train(corpus)
        # for each word, count how many distinct words precede it
        for word in self.ULM.unigram.keys():
            count = 0
            for start_word in self.BLM.bigram.keys():
                if word in self.BLM.bigram[start_word]:
                    count += 1
            self.ends_with[word] = count

    def normalize(self, word):
        """Lambda weight: probability mass reserved for the continuation distribution."""
        count_word = self.ULM.unigram.get(word, 0) + 1
        num_types_following_word = len(self.BLM.bigram.get(word, {}))
        return (self.discount / count_word) * num_types_following_word

    def p_continuation(self, word):
        """Fraction of bigram types that end with this word."""
        return float(self.ends_with.get(word, 0)) / self.BLM.num_types

    def score(self, sentence):
        """Takes a list of strings as argument and returns the log-probability
        of the sentence using your language model. Use whatever data you
        computed in train() here."""
        result = 0.0
        for i in range(len(sentence) - 1):
            first = sentence[i]
            second = sentence[i + 1]
            if first in self.BLM.bigram:
                denom = self.ULM.unigram.get(first)
                if second in self.BLM.bigram[first]:
                    # the bigram is present: use the discounted count
                    numer = max(self.BLM.bigram[first].get(second) - self.discount, 0)
                else:
                    numer = 0.0
            else:
                # first is not part of any bigram
                numer = 0.0
                denom = 1.0
            l = self.normalize(first)         # lambda weight
            pc = self.p_continuation(second)  # continuation probability
            prob = (float(numer) / denom) + (pc * l)
            if prob == 0:
                result += math.log(10e-15)  # small floor to avoid log(0)
            else:
                result += math.log(prob)
        return result
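For comparison, the textbook form of interpolated Kneser-Ney that the class above approximates (the class differs in adding one to the unigram count inside normalize() and in flooring zero probabilities before the log). A self-contained sketch over raw count dicts; the function name is illustrative:

def kneser_ney_bigram_prob(first, second, bigram_counts, unigram_counts, d=0.75):
    # P(second | first) = max(c(first, second) - d, 0) / c(first)
    #                     + lambda(first) * P_continuation(second)
    followers = sum(1 for (w1, w2) in bigram_counts if w1 == first)  # |{w : c(first, w) > 0}|
    enders = sum(1 for (w1, w2) in bigram_counts if w2 == second)    # |{w : c(w, second) > 0}|
    c_first = float(unigram_counts.get(first, 0))
    if c_first > 0:
        discounted = max(bigram_counts.get((first, second), 0) - d, 0) / c_first
        lam = (d / c_first) * followers
    else:
        discounted, lam = 0.0, 0.0
    p_continuation = enders / float(len(bigram_counts))
    return discounted + lam * p_continuation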
def translate(self, corpus_filename, tagged_corpus_filename):
    print "BEGINNING TRANSLATION\n"
    f = open(corpus_filename, 'r')
    corpus = self.read_tagged_corpus(tagged_corpus_filename)
    for sentence in corpus:
        # First, just do a direct translation for testing/analysis purposes.
        direct_translation = []
        for word_tuple in sentence:
            word = word_tuple[0]
            tag_info = word_tuple[1]
            if word in self.punctuation or word not in self.dictionary:
                direct_translation.append([word, word])
            else:
                # it's a Russian word, look it up in the dictionary
                info = self.dictionary[word].split('.')
                # info[1], if it exists, would be the case (dat, gen, etc.),
                # but the tagger will provide this instead
                english_word = info[0]
                direct_translation.append([english_word, tag_info])

        # Now we have a direct translation. Apply the Russian -> Russian rules.
        self.shto_translate(sentence)
        self.kak_translate(sentence)
        self.he_has_she_has(sentence)
        self.negation(sentence)

        # Now, do the "direct" translation.
        translation_candidates = [[]]
        for word_tuple in sentence:
            word = word_tuple[0]
            tag_info = word_tuple[1]
            english_candidate = word_tuple[3]
            for tc in translation_candidates[:]:
                if word in self.punctuation:
                    tc.append([word, word])
                elif english_candidate is not None:
                    if type(english_candidate) is str:
                        if english_candidate == "":
                            # There was no proposed candidate, so translate
                            # from the dictionary. info[1], if it exists, would
                            # be the case, but the tagger should have provided
                            # this already.
                            info = self.dictionary[word].split('.')
                            english_word = info[0]
                            tc.append([english_word, tag_info])
                        else:
                            tc.append([english_candidate, tag_info])
                    else:
                        # There was a list of possible English words, so
                        # generate multiple candidates.
                        tc.append([english_candidate[0], tag_info])
                        for i in xrange(1, len(english_candidate)):
                            translation = tc[:-1]
                            translation.append([english_candidate[i], tag_info])
                            translation_candidates.append(translation)

        # At this point, the Russian -> Russian rules have been applied.
        # Now, apply the English -> English rules, and continue keeping
        # track of candidates.

        # Apply genitive rule:
        for tc in translation_candidates:
            self.interpret_genitives(tc)

        # Apply dative rule (many possible results):
        all_results = []
        for tc in translation_candidates:
            results = self.interpret_datives(tc[:])
            for r in results:
                all_results.append(r)
        for r in all_results:
            translation_candidates.append(r)

        # Apply reordering rule:
        old_candidates = translation_candidates[:]
        for t in old_candidates:
            new_t = self.group_nouns_with_adj(t)
            translation_candidates.append(new_t)

        # Apply articles rule:
        all_results = []
        for tc in translation_candidates:
            results = self.add_articles(tc[:])
            for r in results:
                all_results.append(r)
        for r in all_results:
            translation_candidates.append(r)

        # Apply subjects rule:
        for tc in translation_candidates:
            self.add_subjects(tc)

        # Now do some purely aesthetic formatting (capitalization).
        # Note this won't mess up the language model.
        self.capitalize(direct_translation)
        for tc in translation_candidates:
            self.capitalize(tc)

        # Now, build the language model, which will choose the best candidate (draft).
        train_file = open('../data/language_model_training_corpus.txt')
        trainingCorpus = []
        for line in train_file:
            sentence = re.findall(r"[\w']+|[.,!?;]", line.lower())
            if len(sentence) > 0:
                sentence = ['<s>'] + sentence + ['</s>']
                trainingCorpus.append(sentence)
        lm = LaplaceBigramLanguageModel(trainingCorpus)

        # Finally, use the language model to pick the best candidate!
        maxScore = float("-inf")
        maxScoreSentence = ""
        for tc in translation_candidates:
            tc_string = self.translation_to_str(tc)
            sentence = re.findall(r"[\w']+|[.,!?;]", tc_string.lower())
            if len(sentence) > 0:
                sentence = ['<s>'] + sentence + ['</s>']
                score = lm.score(sentence)
                if score > maxScore:
                    maxScore = score
                    maxScoreSentence = tc_string

        # Output the results!
        print "Original Russian sentence:"
        print f.readline()[:-1]
        print "Direct translation into English:"
        print self.translation_to_str(direct_translation)
        print "The best translation candidate, as chosen by our language model:"
        print maxScoreSentence
        print ""
    print "DONE"
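One detail worth flagging in the snippet above: the bigram model is rebuilt from the training file for every source sentence. A sketch of the selection step pulled out into a helper (the helper name is illustrative) so the model can be trained once and reused across sentences:

import re

def best_candidate(lm, candidates, to_str):
    # Return the candidate whose tokenized form the bigram LM scores highest.
    best_score, best_sentence = float("-inf"), ""
    for tc in candidates:
        tc_string = to_str(tc)
        tokens = re.findall(r"[\w']+|[.,!?;]", tc_string.lower())
        if tokens:
            score = lm.score(['<s>'] + tokens + ['</s>'])
            if score > best_score:
                best_score, best_sentence = score, tc_string
    return best_sentence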
def langModel():
    trainPath = "es-en/train/europarl-v7.es-en.en"  # 'holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)
    LM = LaplaceBigramLanguageModel(trainingCorpus)
    return LM
def __init__(self, corpus): """Initialize your data structures in the constructor.""" # TODO your code here self.ULM = LaplaceUnigramLanguageModel(corpus) self.BLM = LaplaceBigramLanguageModel(corpus) self.train(corpus)