예제 #1
0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data.
    """
  
    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)
    #print str(trainingCorpus)
  
    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)
  
    print 'Uniform Language Model: '
    uniformLM = UniformLanguageModel(trainingCorpus)
    print "VocSize= " + str(len(uniformLM.words))
    print sent
    print tokens
    print "uniform score=" + str(uniformLM.score(tokens))
  
    print 'Unigram Language Model: '
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print "VocSize= " + str(len(unigramLM.unigramCounts))
    print "unigram score=" + str(unigramLM.score(tokens))
  
    print 'Laplace Unigram Language Model: ' 
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print "VocSize= " + str(len(laplaceUnigramLM.f1))
    print "unigram score=" + str(laplaceUnigramLM.score(tokens))
  
    print 'Laplace Bigram Language Model: '
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print "bigram score=" + str(laplaceBigramLM.score(tokens))
  
    print 'Laplace Ngram Language Model: N=2'
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus,2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print "N=2gram score=" + str(laplaceN2gramLM.score(tokens))

    print 'Laplace Ngram Language Model: N=3'
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus,3)
    laplaceN3gramLM.save("smallN3gram.LM")
    print "N=3gram score=" + str(laplaceN2gramLM.score(tokens))

    print 'Custom Language Model: '
    customLM = CustomLanguageModel(trainingCorpus,N=2)
    print "Custom LM score=" + str(customLM.score(tokens))
예제 #2
0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data.
    """

    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)
    #print str(trainingCorpus)

    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)

    print 'Uniform Language Model: '
    uniformLM = UniformLanguageModel(trainingCorpus)
    print "VocSize= " + str(len(uniformLM.words))
    print sent
    print tokens
    print "uniform score=" + str(uniformLM.score(tokens))

    print 'Unigram Language Model: '
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print "VocSize= " + str(len(unigramLM.unigramCounts))
    print "unigram score=" + str(unigramLM.score(tokens))

    print 'Laplace Unigram Language Model: '
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print "VocSize= " + str(len(laplaceUnigramLM.f1))
    print "unigram score=" + str(laplaceUnigramLM.score(tokens))

    print 'Laplace Bigram Language Model: '
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print "bigram score=" + str(laplaceBigramLM.score(tokens))

    print 'Laplace Ngram Language Model: N=2'
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print "N=2gram score=" + str(laplaceN2gramLM.score(tokens))

    print 'Laplace Ngram Language Model: N=3'
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3)
    laplaceN3gramLM.save("smallN3gram.LM")
    print "N=3gram score=" + str(laplaceN2gramLM.score(tokens))

    print 'Custom Language Model: '
    customLM = CustomLanguageModel(trainingCorpus, N=2)
    print "Custom LM score=" + str(customLM.score(tokens))
예제 #3
0
    def output(self, partId, ch_aux):
        """Uses the student code to compute the output for test cases."""
        trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

        if partId in [1, 2]:
            editModel = EditModel('../data/count_1edit.txt', trainCorpus)
            return json.dumps([[(e.editedWord, e.rule())
                                for e in editModel.edits(line.strip())]
                               for line in ch_aux.split("\n")])
        else:
            testCorpus = HolbrookCorpus()
            testCorpus.slurpString(ch_aux)
            lm = None
            if partId in [3, 4]:
                lm = LaplaceUnigramLanguageModel(trainCorpus)
            elif partId in [5, 6]:
                lm = LaplaceBigramLanguageModel(trainCorpus)
            elif partId in [7, 8]:
                lm = StupidBackoffLanguageModel(trainCorpus)
            elif partId in [9, 10]:
                lm = CustomLanguageModel(trainCorpus)
            else:
                print 'Unknown partId: " + partId'
                return None

            speller = SpellCorrect(lm, trainCorpus)
            output = speller.correctCorpus(testCorpus)
            # put in the part ID as well
            output = '[["%d"],%s' % (partId, output[1:])
            return output
예제 #4
0
 def train(self, corpus):
     """ Takes a corpus and trains your language model.

         Delegates all counting to a Laplace bigram model and exposes its
         unigram and bigram count tables as self.uniGramCount / self.biGramCount.
     """
     bigram_model = LaplaceBigramLanguageModel(corpus)
     self.bigram = bigram_model
     self.uniGramCount = bigram_model.uniGram.uniDict
     self.biGramCount = bigram_model.bigramCount
 def __init__(self, corpus):
   """Initialize your data structures in the constructor."""
   # Backing Laplace-smoothed models: bigram (primary) and unigram (backoff).
   self.BLM = LaplaceBigramLanguageModel(corpus)
   self.ULM = LaplaceUnigramLanguageModel(corpus)
   # Absolute-discount parameter used by the smoothing.
   self.discount = 0.75
   # Maps word -> number of bigrams that end with that word.
   self.ends_with = {}
   self.train(corpus)
예제 #6
0
 def __init__(self, corpus):
     """Initialize the trigram/bigram/unigram count tables and train.

     Reuses a Laplace bigram model for the lower-order counts.
     """
     self.trigramCount = collections.defaultdict(lambda: 0)
     bg = LaplaceBigramLanguageModel(corpus)
     self.bigramCount = bg.bigramCount
     self.uniGram = bg.uniGram.uniDict
     self.train(corpus)
     # Vocabulary size = number of distinct bigram keys.
     # len(d) equals len(d.keys()) without materializing the key list.
     self.vocab = len(self.bigramCount)
class StupidBackoffLanguageModel:
  """Bigram model that backs off to a Laplace unigram model whenever a
  bigram (or its first word) was never observed in training."""

  def __init__(self, corpus):
    """Build the backing Laplace unigram and bigram models, then train."""
    self.ULM = LaplaceUnigramLanguageModel(corpus)
    self.BLM = LaplaceBigramLanguageModel(corpus)
    self.train(corpus)

  def train(self, corpus):
    """ Takes a corpus and trains your language model.
        Delegates all counting to the wrapped unigram and bigram models.
    """
    self.ULM.train(corpus)
    self.BLM.train(corpus)

  def score(self, sentence):
    """ Takes a list of strings as argument and returns the log-probability
        of the sentence: bigram relative frequency when the pair was seen,
        otherwise back off to the unigram model.
    """
    result = 0.0
    for first, second in zip(sentence, sentence[1:]):
      followers = self.BLM.bigram.get(first)
      if followers is None:
        # The first word never started any bigram: back off on it.
        # NOTE(review): ULM.score is given a single word here, not a token
        # list — confirm LaplaceUnigramLanguageModel.score handles that.
        result += self.ULM.score(first)
      elif second in followers:
        # Bigram observed: use its relative frequency among this history.
        result += math.log(float(followers[second]) / sum(followers.values()))
      else:
        # History known but the pair is unseen: back off on the second word.
        result += self.ULM.score(second)
    return result
예제 #8
0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    print 'Unigram Language Model: '
    unigramLM = UnigramLanguageModel(trainingCorpus)
    unigramSpell = SpellCorrect(unigramLM, trainingCorpus)
    unigramOutcome = unigramSpell.evaluate(devCorpus)
    print str(unigramOutcome)

    print 'Uniform Language Model: '
    uniformLM = UniformLanguageModel(trainingCorpus)
    uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    uniformOutcome = uniformSpell.evaluate(devCorpus)
    print str(uniformOutcome)

    print 'Laplace Unigram Language Model: '
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus)
    laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus)
    print str(laplaceUnigramOutcome)

    print 'Laplace Bigram Language Model: '
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus)
    laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus)
    print str(laplaceBigramOutcome)

    print 'Stupid Backoff Language Model: '
    sbLM = StupidBackoffLanguageModel(trainingCorpus)
    sbSpell = SpellCorrect(sbLM, trainingCorpus)
    sbOutcome = sbSpell.evaluate(devCorpus)
    print str(sbOutcome)

    print 'Custom Language Model (based on LaplaceBigramLanguageModel): '
    customLM = CustomLanguageModel(trainingCorpus)
    customSpell = SpellCorrect(customLM, trainingCorpus)
    customOutcome = customSpell.evaluate(devCorpus)
    print str(customOutcome)

    print 'Custom Language Model2 (based on StupidBackoffLanguageModel): '
    customLM2 = CustomLanguageModel2(trainingCorpus)
    customSpell2 = SpellCorrect(customLM2, trainingCorpus)
    customOutcome2 = customSpell2.evaluate(devCorpus)
    print str(customOutcome2)
예제 #9
0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    print 'Uniform Language Model: '
    uniformLM = UniformLanguageModel(trainingCorpus)
    uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    uniformOutcome = uniformSpell.evaluate(devCorpus)
    print str(uniformOutcome), '\n'

    print 'Laplace Unigram Language Model: '
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus)
    laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus)
    print str(laplaceUnigramOutcome), '\n'

    #It has (accuracy: 0.012739) because of the small corpus (I think ^_^)
    print 'Good-Turing Unigram Language Model: '
    GoodTuringLM = GoodTuringUnigramLanguageModel(trainingCorpus)
    GoodTuringSpell = SpellCorrect(GoodTuringLM, trainingCorpus)
    GoodTuringOutcome = GoodTuringSpell.evaluate(devCorpus)
    print str(GoodTuringOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Laplace Bigram Language Model: '
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus)
    laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus)
    print str(laplaceBigramOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Stupid Backoff Language Model: '
    sbLM = StupidBackoffLanguageModel(trainingCorpus)
    sbSpell = SpellCorrect(sbLM, trainingCorpus)
    sbOutcome = sbSpell.evaluate(devCorpus)
    print str(sbOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Custom Language Model: '
    customLM = CustomLanguageModel(trainingCorpus)
    customSpell = SpellCorrect(customLM, trainingCorpus)
    customOutcome = customSpell.evaluate(devCorpus)
    print str(customOutcome), '\n'
예제 #10
0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    # The Uniform and Stupid Backoff evaluations are currently disabled;
    # add ('\nUniform Language Model: ', UniformLanguageModel) or
    # ('\nStupid Backoff Language Model: ', StupidBackoffLanguageModel)
    # to the list below to re-enable them.
    experiments = [
        ('\nLaplace Unigram Language Model: ', LaplaceUnigramLanguageModel),
        ('\nLaplace Bigram Language Model: ', LaplaceBigramLanguageModel),
        ('\nCustom Language Model: ', CustomLanguageModel),
    ]
    for banner, makeModel in experiments:
        print(banner)
        speller = SpellCorrect(makeModel(trainingCorpus), trainingCorpus)
        print(str(speller.evaluate(devCorpus)))
예제 #11
0
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    partId selects the language model; ch_aux is the raw test corpus text.
    Returns the speller's corrected-corpus output prefixed with the part ID,
    or None when the partId is unknown.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)
    lm = None
    if partId == 1 or partId == 2:
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId == 3 or partId == 4:
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId == 5 or partId == 6:
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId == 7 or partId == 8:
        lm = CustomLanguageModel(trainCorpus)
    else:
        # BUG FIX: the '+ partId' used to sit INSIDE the string literal,
        # so the raw text was printed instead of the actual id.
        print('Unknown partId: ' + str(partId))
        return None

    speller = SpellCorrect(lm, trainCorpus)
    # Renamed from `output`, which shadowed this function's own name.
    result = speller.correctCorpus(testCorpus)
    # Prepend the part ID; drop result's leading '[' so the JSON array
    # stays well-formed.
    return '[["%d"],%s' % (partId, result[1:])
class CustomLanguageModel:
  """Bigram model with Kneser-Ney smoothing (absolute discounting
  interpolated with a continuation-probability distribution)."""

  def __init__(self, corpus):
    """Initialize your data structures in the constructor."""
    self.BLM = LaplaceBigramLanguageModel(corpus)
    self.ULM = LaplaceUnigramLanguageModel(corpus)
    # Absolute-discount parameter subtracted from each observed bigram count.
    self.discount = 0.75
    # word -> number of distinct bigram types that end with that word.
    self.ends_with = dict()
    self.train(corpus)

  def train(self, corpus):
    """ Takes a corpus and trains your language model.

        Computes, for every vocabulary word, the number of distinct bigram
        types ending with it (used by p_continuation).
    """
    self.BLM.train(corpus)
    self.ULM.train(corpus)
    # PERF FIX: single pass over the bigram table instead of re-scanning
    # every start word once per vocabulary word (was O(|V| * |starts|)).
    # Zero-initializing from the unigram vocabulary and incrementing only
    # existing keys yields exactly the same dict as the old nested scan.
    self.ends_with = dict((word, 0) for word in self.ULM.unigram)
    for followers in self.BLM.bigram.values():
      for second in followers:
        if second in self.ends_with:
          self.ends_with[second] += 1

  def normalize(self, word):
    """Lambda weight: probability mass reserved for the continuation term."""
    count_word = self.ULM.unigram.get(word, 0) + 1
    # Number of distinct word types observed following `word`.
    num_type_following_word = len(self.BLM.bigram.get(word, {}))
    return (self.discount / count_word) * num_type_following_word

  def p_continuation(self, word):
    """Continuation probability: share of bigram types ending in `word`."""
    return float(self.ends_with.get(word, 0)) / self.BLM.num_types

  def score(self, sentence):
    """ Takes a list of strings as argument and returns the log-probability of the
        sentence using interpolated Kneser-Ney bigram estimates.
    """
    result = 0.0
    for i in range(len(sentence) - 1):
      first = sentence[i]
      second = sentence[i + 1]
      if first in self.BLM.bigram:
        denom = self.ULM.unigram.get(first)
        if second in self.BLM.bigram[first]:
          # Discounted count for the observed bigram (floored at zero).
          numer = max(self.BLM.bigram[first].get(second) - self.discount, 0)
        else:
          numer = 0.0
      else:
        # `first` never starts a bigram: only the continuation term remains.
        numer = 0.0
        denom = 1.0

      l = self.normalize(first)         # lambda interpolation weight
      pc = self.p_continuation(second)  # continuation probability
      prob = (float(numer) / denom) + (pc * l)
      if prob == 0:
        # Floor to avoid log(0) for completely unseen events.
        result += math.log(10e-15)
      else:
        result += math.log(prob)
    return result
예제 #13
0
    def translate(self, corpus_filename, tagged_corpus_filename):
        """Translate a tagged Russian corpus into English and print results.

        corpus_filename: raw Russian sentences, one per line (display only).
        tagged_corpus_filename: the same sentences in tagged form, parsed by
            self.read_tagged_corpus().

        For every sentence this builds a direct word-by-word translation,
        applies the rule-based Russian->Russian and English->English
        transforms to generate candidate translations, scores every
        candidate with a Laplace bigram language model, and prints the
        original, the direct translation, and the best-scoring candidate.

        NOTE(review): `f` and `train_file` are opened but never closed, and
        the language model is retrained from disk once per sentence —
        hoisting that out of the loop looks safe, but confirm first.
        """
        print "BEGINNING TRANSLATION\n"
        f = open(corpus_filename, 'r')
        corpus = self.read_tagged_corpus(tagged_corpus_filename)
        for sentence in corpus:
            direct_translation = []
            """
			First, just do a direct translation for testing/analysis purposes
			"""
            for word_tuple in sentence:
                word = word_tuple[0]
                tag_info = word_tuple[1]
                # Punctuation and out-of-dictionary words pass through as-is.
                if word in self.punctuation or word not in self.dictionary:
                    direct_translation.append([word, word])
                else:  #it's a russian word, look it up in the dictionary
                    info = self.dictionary[word].split('.')
                    english_word = info[
                        0]  #info[1], if it exists, would be the Case (dat, gen, etc.),
                    #  but the tagger will provide this instead
                    english_word_duple = [english_word, tag_info]
                    direct_translation.append(english_word_duple)
            """
			Now we have a direct translation.
			Time for da real shiz
			Apply russian -> russian rules
			"""
            # These rules mutate `sentence` in place before candidates are built.
            self.shto_translate(sentence)
            self.kak_translate(sentence)
            self.he_has_she_has(sentence)
            self.negation(sentence)
            """
			Now, do the "direct" translation
			"""
            translation_candidates = []
            translation_candidates.append([])
            for word_tuple in sentence:
                word = word_tuple[0]
                tag_info = word_tuple[1]
                # word_tuple[3] is the candidate proposed by the rules above:
                # None, "" (no proposal), a single word, or a list of words.
                english_candidate = word_tuple[3]
                # Iterate over a copy — the body appends new candidates.
                for tc in translation_candidates[:]:
                    if word in self.punctuation:
                        tc.append([word, word])
                    elif not english_candidate == None:
                        if type(english_candidate) is str:
                            if english_candidate == "":
                                """
								There was no proposed candidate, so translate from the dictionary
								"""
                                info = self.dictionary[word].split('.')
                                english_word = info[
                                    0]  #info[1], if it exists, would be the Case (dat, gen, etc.),
                                #  but the tagger should have provided this already
                                english_word_duple = [english_word, tag_info]
                                tc.append(english_word_duple)
                            else:
                                tc.append([english_candidate, tag_info])
                        else:
                            """
							There was a list of possible english words, so generate multiple candidates
							"""
                            tc.append([english_candidate[0], tag_info])
                            for i in xrange(1, len(english_candidate)):
                                # Fork a new candidate for each alternative word.
                                translation = tc[:-1]
                                translation.append(
                                    [english_candidate[i], tag_info])
                                translation_candidates.append(translation)
            """
			At this point, the russian -> russian rules have been applied
			Now, apply the english -> english rules, and continue keeping track of candidates
			"""
            #translation_candidates = [ translation[:] ]
            #Apply Genitive rule:
            for tc in translation_candidates:
                self.interpret_genitives(tc)
            #Apply Dative rule (many possible results):
            all_results = []
            for tc in translation_candidates:
                results = self.interpret_datives(tc[:])
                for r in results:
                    all_results.append(r)
            for r in all_results:
                translation_candidates.append(r)
            #Apply reordering rule:
            old_candidates = translation_candidates[:]
            for t in old_candidates:
                new_t = self.group_nouns_with_adj(t)
                translation_candidates.append(new_t)
            #Apply articles rule:
            all_results = []
            for tc in translation_candidates:
                results = self.add_articles(tc[:])
                for r in results:
                    all_results.append(r)
            for r in all_results:
                translation_candidates.append(r)
            #Apply subjects rule:
            for tc in translation_candidates:
                self.add_subjects(tc)
            """
			Now do some purely aesthetic formatting (capitalization)
			Note this won't mess up the language model
			"""
            self.capitalize(direct_translation)
            for tc in translation_candidates:
                self.capitalize(tc)
            """
			Now, build the Language Model, which will choose the best candidate
			"""
            #DRAFT
            train_file = open('../data/language_model_training_corpus.txt')
            trainingCorpus = []
            # NOTE(review): this loop rebinds `sentence`, shadowing the outer
            # loop variable; harmless only because the tagged sentence is not
            # used again below this point.
            for line in train_file:
                sentence = re.findall(r"[\w']+|[.,!?;]", line.lower())
                if len(sentence) > 0:
                    sentence = ['<s>'] + sentence + ['</s>']
                    trainingCorpus.append(sentence)
            lm = LaplaceBigramLanguageModel(trainingCorpus)
            """
			Finally, use the Language Model to pick the best candidate!
			"""
            maxScore = float("-inf")
            maxScoreSentence = ""
            for tc in translation_candidates:
                tc_string = self.translation_to_str(tc)
                # Tokenize the candidate the same way as the training data.
                sentence = re.findall(r"[\w']+|[.,!?;]", tc_string.lower())
                if len(sentence) > 0:
                    sentence = ['<s>'] + sentence + ['</s>']
                    score = lm.score(sentence)
                    #print tc_string
                    #print "\tScore: ", score
                    if score > maxScore:  #normalizing here!
                        maxScore = score
                        maxScoreSentence = tc_string
            """
			Output the results!!
			"""
            print "Original Russian sentence:"
            # Reads lazily from the raw corpus file, one line per tagged
            # sentence; [:-1] strips the trailing newline.
            print f.readline()[:-1]
            print "Direct translation into English:"
            print self.translation_to_str(direct_translation)
            """
			print "All translation candidates created by our strategies:"
			for tc in translation_candidates:
				print self.translation_to_str(tc)
			"""
            print "The best translation candidate, as chosen by our Language Model:"
            print maxScoreSentence
            print ""
        print "DONE"
예제 #14
0
def langModel(trainPath="es-en/train/europarl-v7.es-en.en"):
    """Build and return a Laplace bigram language model.

    trainPath: training corpus file; defaults to the English side of the
    es-en Europarl corpus (previously a hard-coded constant — made a
    defaulted parameter so other corpora, e.g. 'holbrook-tagged-train.dat',
    can be used without editing this function).
    """
    trainingCorpus = HolbrookCorpus(trainPath)
    LM = LaplaceBigramLanguageModel(trainingCorpus)
    return LM
 def __init__(self, corpus):
   """Initialize your data structures in the constructor."""
   # Backing models: Laplace-smoothed unigram (backoff) and bigram (primary).
   unigram_model = LaplaceUnigramLanguageModel(corpus)
   bigram_model = LaplaceBigramLanguageModel(corpus)
   self.ULM = unigram_model
   self.BLM = bigram_model
   self.train(corpus)