예제 #1
0
def main():
    """Train every language model on the micro en_US corpus and score one sentence.

    The corpus is loaded from ``trainPath``; a fixed sample sentence is
    tokenized once and passed to each model's ``score`` method, and the
    result is printed.  The Laplace unigram, bigram and N-gram models are
    additionally written out through their ``save`` method.
    """
    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)
    # Uncomment to inspect the loaded corpus:
    # print(str(trainingCorpus))

    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)

    # NOTE: every print below takes exactly one argument in parentheses, so
    # the output is the same whether print is a statement or a function.
    print('Uniform Language Model: ')
    uniformLM = UniformLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(uniformLM.words)))
    print(sent)
    print(tokens)
    print("uniform score=" + str(uniformLM.score(tokens)))

    print('Unigram Language Model: ')
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(unigramLM.unigramCounts)))
    print("unigram score=" + str(unigramLM.score(tokens)))

    print('Laplace Unigram Language Model: ')
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print("VocSize= " + str(len(laplaceUnigramLM.f1)))
    print("unigram score=" + str(laplaceUnigramLM.score(tokens)))

    print('Laplace Bigram Language Model: ')
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print("bigram score=" + str(laplaceBigramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=2')
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print("N=2gram score=" + str(laplaceN2gramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=3')
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3)
    laplaceN3gramLM.save("smallN3gram.LM")
    # Score with the trigram model itself; the previous code reused the N=2
    # model here, so the "N=3" line repeated the bigram score.
    print("N=3gram score=" + str(laplaceN3gramLM.score(tokens)))

    print('Custom Language Model: ')
    customLM = CustomLanguageModel(trainingCorpus, N=2)
    print("Custom LM score=" + str(customLM.score(tokens)))
예제 #2
0
def main():
    """Train every language model on the micro en_US corpus and score one sentence.

    The corpus is loaded from ``trainPath``; a fixed sample sentence is
    tokenized once and passed to each model's ``score`` method, and the
    result is printed.  The Laplace unigram, bigram and N-gram models are
    additionally written out through their ``save`` method.
    """
    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)
    # Uncomment to inspect the loaded corpus:
    # print(str(trainingCorpus))

    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)

    # NOTE: every print below takes exactly one argument in parentheses, so
    # the output is the same whether print is a statement or a function.
    print('Uniform Language Model: ')
    uniformLM = UniformLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(uniformLM.words)))
    print(sent)
    print(tokens)
    print("uniform score=" + str(uniformLM.score(tokens)))

    print('Unigram Language Model: ')
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(unigramLM.unigramCounts)))
    print("unigram score=" + str(unigramLM.score(tokens)))

    print('Laplace Unigram Language Model: ')
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print("VocSize= " + str(len(laplaceUnigramLM.f1)))
    print("unigram score=" + str(laplaceUnigramLM.score(tokens)))

    print('Laplace Bigram Language Model: ')
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print("bigram score=" + str(laplaceBigramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=2')
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print("N=2gram score=" + str(laplaceN2gramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=3')
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3)
    laplaceN3gramLM.save("smallN3gram.LM")
    # Score with the trigram model itself; the previous code reused the N=2
    # model here, so the "N=3" line repeated the bigram score.
    print("N=3gram score=" + str(laplaceN3gramLM.score(tokens)))

    print('Custom Language Model: ')
    customLM = CustomLanguageModel(trainingCorpus, N=2)
    print("Custom LM score=" + str(customLM.score(tokens)))
예제 #3
0
    def output(self, partId, ch_aux):
        """Uses the student code to compute the output for test cases.

        Parts 1-2 run the edit model on every line of ``ch_aux`` and return
        a JSON string of ``(editedWord, rule)`` pairs per line.  Parts 3-10
        load ``ch_aux`` into a corpus, spell-correct it with the language
        model assigned to that part, and return the corrected output with
        the part id spliced in at the front.

        Args:
            partId: Part number, expected to be in 1..10.
            ch_aux: Raw test input text for the part.

        Returns:
            The output string for the part, or ``None`` (after printing a
            message) when ``partId`` is not recognised.
        """
        # Resolve the part before touching any data file so that an unknown
        # part is rejected without loading the training corpus.
        if partId in (1, 2):
            modelClass = None  # edit-model parts: no language model needed
        elif partId in (3, 4):
            modelClass = LaplaceUnigramLanguageModel
        elif partId in (5, 6):
            modelClass = LaplaceBigramLanguageModel
        elif partId in (7, 8):
            modelClass = StupidBackoffLanguageModel
        elif partId in (9, 10):
            modelClass = CustomLanguageModel
        else:
            # The old message had mismatched quotes and printed the literal
            # text '" + partId' rather than the offending value.
            print('Unknown partId: ' + str(partId))
            return None

        trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

        if modelClass is None:
            editModel = EditModel('../data/count_1edit.txt', trainCorpus)
            return json.dumps([[(e.editedWord, e.rule())
                                for e in editModel.edits(line.strip())]
                               for line in ch_aux.split("\n")])

        testCorpus = HolbrookCorpus()
        testCorpus.slurpString(ch_aux)

        speller = SpellCorrect(modelClass(trainCorpus), trainCorpus)
        corrected = speller.correctCorpus(testCorpus)
        # put in the part ID as well: the first character of the corrected
        # output is replaced by '[["<partId>"],'
        return '[["%d"],%s' % (partId, corrected[1:])
예제 #4
0
def main():
    """Trains all of the language models and tests them on the dev data.

    Each model is built from the training corpus, wrapped in a SpellCorrect
    instance, evaluated on the dev corpus, and the outcome is printed.
    Change devPath if you wish to do things like test on the training data.
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    def report(title, modelClass):
        # Train one model, evaluate it on the dev corpus and print the
        # outcome.  The heading goes out first so it is visible while the
        # model is still training.  Single-argument print(...) behaves the
        # same whether print is a statement or a function.
        print(title)
        languageModel = modelClass(trainingCorpus)
        speller = SpellCorrect(languageModel, trainingCorpus)
        print(str(speller.evaluate(devCorpus)))

    report('Unigram Language Model: ', UnigramLanguageModel)
    report('Uniform Language Model: ', UniformLanguageModel)
    report('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel)
    report('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel)
    report('Stupid Backoff Language Model: ', StupidBackoffLanguageModel)
    report('Custom Language Model (based on LaplaceBigramLanguageModel): ',
           CustomLanguageModel)
    report('Custom Language Model2 (based on StupidBackoffLanguageModel): ',
           CustomLanguageModel2)
예제 #5
0
def main():
    """Trains all of the language models and tests them on the dev data.

    Each model is built from the training corpus, wrapped in a SpellCorrect
    instance, evaluated on the dev corpus, and the outcome is printed
    followed by a blank separator.
    Change devPath if you wish to do things like test on the training data.
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    def report(title, modelClass):
        # Train one model, evaluate it on the dev corpus and print the
        # outcome.  The heading goes out first so it is visible while the
        # model is still training.
        print(title)
        languageModel = modelClass(trainingCorpus)
        speller = SpellCorrect(languageModel, trainingCorpus)
        outcome = speller.evaluate(devCorpus)
        # Same bytes as the former `print str(outcome), '\n'` statement:
        # the outcome, a separating space, '\n', then print's own newline.
        print(str(outcome) + ' ' + '\n')

    report('Uniform Language Model: ', UniformLanguageModel)
    report('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel)

    # It has (accuracy: 0.012739) because of the small corpus (I think ^_^)
    report('Good-Turing Unigram Language Model: ', GoodTuringUnigramLanguageModel)

    # Each of the remaining models takes some time, about (70) seconds
    report('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel)
    report('Stupid Backoff Language Model: ', StupidBackoffLanguageModel)
    report('Custom Language Model: ', CustomLanguageModel)
예제 #6
0
def main():
    """Train the enabled language models and print each one's dev-set result.

    Every enabled model is built from the training corpus, handed to a
    SpellCorrect instance and evaluated on the dev corpus.  Point devPath
    at another file (e.g. the training data) to evaluate on it instead.
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    devPath = '../data/holbrook-tagged-dev.dat'
    trainingCorpus = HolbrookCorpus(trainPath)
    devCorpus = HolbrookCorpus(devPath)

    def evaluate(heading, modelClass):
        # Heading first, then train -> wrap in a speller -> evaluate -> print.
        print(heading)
        speller = SpellCorrect(modelClass(trainingCorpus), trainingCorpus)
        print(str(speller.evaluate(devCorpus)))

    # Disabled:
    # evaluate('Uniform Language Model: ', UniformLanguageModel)

    evaluate('\nLaplace Unigram Language Model: ', LaplaceUnigramLanguageModel)
    evaluate('\nLaplace Bigram Language Model: ', LaplaceBigramLanguageModel)

    # Disabled:
    # evaluate('\nStupid Backoff Language Model: ', StupidBackoffLanguageModel)

    evaluate('\nCustom Language Model: ', CustomLanguageModel)
예제 #7
0
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    ``ch_aux`` is loaded into a corpus, spell-corrected with the language
    model assigned to ``partId``, and the corrected output is returned with
    the part id spliced in at the front.

    Args:
        partId: Part number, expected to be in 1..8.
        ch_aux: Raw test input text for the part.

    Returns:
        The output string for the part, or ``None`` (after printing a
        message) when ``partId`` is not recognised.
    """
    # Resolve the part before touching any data file so that an unknown
    # part is rejected without loading the training corpus.
    if partId in (1, 2):
        modelClass = LaplaceUnigramLanguageModel
    elif partId in (3, 4):
        modelClass = LaplaceBigramLanguageModel
    elif partId in (5, 6):
        modelClass = StupidBackoffLanguageModel
    elif partId in (7, 8):
        modelClass = CustomLanguageModel
    else:
        # The old message had mismatched quotes and printed the literal
        # text '" + partId' rather than the offending value.
        print('Unknown partId: ' + str(partId))
        return None

    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    speller = SpellCorrect(modelClass(trainCorpus), trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well: the first character of the corrected
    # output is replaced by '[["<partId>"],'
    return '[["%d"],%s' % (partId, corrected[1:])
예제 #8
0
# bad_model = CustomLanguageModel(bad_train, n=n, coef=1)
#
# print('Evaluation of the models...')
# true = 0
# for i in range(len(good_test)):
#     print(str(i) + '/' + str(len(good_test)))
#     if good_model.score(good_test[i]) > bad_model.score(good_test[i]):
#         true += 1
#     if good_model.score(bad_test[i]) < bad_model.score(bad_test[i]):
#         true += 1
#
# print('accuracy ' + str(n) + '-gram, coef=1: ' + str(true / (len(good_test) * 2)))
#
#################################################
# coef = 0.4
# Build one CustomLanguageModel from good_train and one from bad_train, using
# the same n and coef=0.4 for both.
print('Creation of the models...')
good_model = CustomLanguageModel(good_train, n=n, coef=0.4)
bad_model = CustomLanguageModel(bad_train, n=n, coef=0.4)

print('Evaluation of the models...')
# `true` counts correct decisions; each index contributes up to two of them
# (one for the good_test item, one for the bad_test item).
true = 0
# The progress bar is advanced manually, once per loop iteration.
t = tqdm.tqdm(total=len(good_test))
for i in range(len(good_test)):
    t.update()
    # Correct when good_model scores the good_test item strictly higher than
    # bad_model does (ties count as wrong).
    if good_model.score(good_test[i]) > bad_model.score(good_test[i]):
        true += 1
    # Correct when good_model scores the bad_test item strictly lower than
    # bad_model does (ties count as wrong).
    # NOTE(review): bad_test is indexed with good_test's indices, which
    # assumes len(bad_test) >= len(good_test) -- confirm.
    if good_model.score(bad_test[i]) < bad_model.score(bad_test[i]):
        true += 1
t.close()

# Two decisions are made per index, hence the denominator len(good_test) * 2.
# NOTE(review): `/` only yields a fraction under true division (Python 3);
# an empty good_test would raise ZeroDivisionError here -- confirm inputs.
print('accuracy ' + str(n) + '-gram, coef=0.4: ' + str(true / (len(good_test) * 2)))