Ejemplo n.º 1
0
    def output(self, partId, ch_aux):
        """Compute the grader output for one assignment part using the student code.

        Parts 1-2 exercise the edit model: each newline-separated line of
        ``ch_aux`` is stripped and passed to ``EditModel.edits``; the result is
        a JSON-encoded list holding, per line, the ``(editedWord, rule())``
        pairs of the returned edits.

        Parts 3-10 select a language model, load ``ch_aux`` into a test corpus
        and run ``SpellCorrect.correctCorpus`` on it.  The first character of
        that output is replaced by ``[["<partId>"],`` so the part id travels
        with the answer.

        Returns None, after printing a diagnostic, when ``partId`` is not one
        of the known parts.
        """
        trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

        if partId in (1, 2):
            editModel = EditModel('../data/count_1edit.txt', trainCorpus)
            editsPerLine = []
            for line in ch_aux.split("\n"):
                editsPerLine.append([(e.editedWord, e.rule())
                                     for e in editModel.edits(line.strip())])
            return json.dumps(editsPerLine)

        testCorpus = HolbrookCorpus()
        testCorpus.slurpString(ch_aux)

        if partId in (3, 4):
            lm = LaplaceUnigramLanguageModel(trainCorpus)
        elif partId in (5, 6):
            lm = LaplaceBigramLanguageModel(trainCorpus)
        elif partId in (7, 8):
            lm = StupidBackoffLanguageModel(trainCorpus)
        elif partId in (9, 10):
            lm = CustomLanguageModel(trainCorpus)
        else:
            # str() keeps the concatenation valid whatever type partId has;
            # the single-argument print form runs on Python 2 and 3 alike.
            print('Unknown partId: ' + str(partId))
            return None

        speller = SpellCorrect(lm, trainCorpus)
        corrected = speller.correctCorpus(testCorpus)
        # put in the part ID as well
        return '[["%d"],%s' % (partId, corrected[1:])
Ejemplo n.º 2
0
 def __init__(self, corpus):
     """Set up the count table and unigram model, train on `corpus`, then record the key count."""
     # Missing keys read as zero.
     self.bigramCount = collections.defaultdict(int)
     # Companion unigram model constructed from the same corpus.
     self.uniGram = LaplaceUnigramLanguageModel(corpus)
     self.train(corpus)
     # Number of distinct keys present in bigramCount once train() has run.
     self.vocab = len(self.bigramCount)
Ejemplo n.º 3
0
 def __init__(self, corpus):
     """Instantiate the Laplace unigram and bigram models on `corpus` and keep their tables."""
     uniLM = LaplaceUnigramLanguageModel(corpus)
     biLM = LaplaceBigramLanguageModel(corpus)
     # Only the totals and count tables are retained; the helper models are
     # local and go out of scope when the constructor returns.
     self.total = uniLM.total
     self.UnigramCounts = uniLM.LaplaceUnigramCounts
     self.BigramCounts = biLM.LaplaceBigramCounts
Ejemplo n.º 4
0
 def __init__(self, corpus):
   """Derive the model's starting statistics from a Laplace unigram model of `corpus`.

   Stores the unigram count table (`Unicounts`), a shallow copy of it
   (`newCounts`), the unigram model's `total` (`N`) and the number of
   entries whose count equals one (`N_1`), then calls `train()`.
   """
   model = LaplaceUnigramLanguageModel(corpus)
   self.Unicounts = model.LaplaceUnigramCounts
   self.newCounts = copy.copy(self.Unicounts)
   self.N = model.total
   # values() is only a list (with .count) on Python 2; counting with a
   # generator gives the same number and also works on Python 3 views.
   self.N_1 = sum(1 for count in self.Unicounts.values() if count == 1)
   self.train()
Ejemplo n.º 5
0
 def __init__(self, corpus):
   """Borrow unigram statistics from a Laplace unigram model, then train on `corpus`."""
   unigramLM = LaplaceUnigramLanguageModel(corpus)
   # Starts empty; nothing is added to it in this constructor before train().
   self.LaplaceBigramCounts = Counter()
   self.LaplaceUnigramCounts = unigramLM.LaplaceUnigramCounts
   # The 'total' variable is not needed in this file; it is kept because
   # other files use it.
   self.total = unigramLM.total
   self.train(corpus)
Ejemplo n.º 6
0
def main():
    """Evaluate every language model as a spelling corrector on the dev data.

    Each model is constructed from the training corpus, wrapped in a
    SpellCorrect instance and evaluated on the dev corpus; a heading and the
    evaluation outcome are printed per model.  Point the dev corpus path at
    another file to evaluate on different data (e.g. the training data).
    """
    trainingCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    devCorpus = HolbrookCorpus('../data/holbrook-tagged-dev.dat')

    # (heading, model class) pairs, evaluated in this order.
    experiments = [
        ('Unigram Language Model: ', UnigramLanguageModel),
        ('Uniform Language Model: ', UniformLanguageModel),
        ('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel),
        ('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel),
        ('Stupid Backoff Language Model: ', StupidBackoffLanguageModel),
        ('Custom Language Model (based on LaplaceBigramLanguageModel): ',
         CustomLanguageModel),
        ('Custom Language Model2 (based on StupidBackoffLanguageModel): ',
         CustomLanguageModel2),
    ]

    for heading, modelClass in experiments:
        print(heading)
        speller = SpellCorrect(modelClass(trainingCorpus), trainingCorpus)
        print(str(speller.evaluate(devCorpus)))
Ejemplo n.º 7
0
def main():
    """Evaluate every language model as a spelling corrector on the dev data.

    Each model is constructed from the training corpus, wrapped in a
    SpellCorrect instance and evaluated on the dev corpus.  A heading is
    printed before each outcome and a blank line after it.  Point the dev
    corpus path at another file to evaluate on different data.
    """
    trainingCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    devCorpus = HolbrookCorpus('../data/holbrook-tagged-dev.dat')

    # (heading, model class) pairs, evaluated in this order.
    experiments = [
        ('Uniform Language Model: ', UniformLanguageModel),
        ('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel),
        # Reported accuracy 0.012739; the author suspects the small corpus.
        ('Good-Turing Unigram Language Model: ', GoodTuringUnigramLanguageModel),
        # The next three models each take roughly 70 seconds.
        ('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel),
        ('Stupid Backoff Language Model: ', StupidBackoffLanguageModel),
        ('Custom Language Model: ', CustomLanguageModel),
    ]

    for heading, modelClass in experiments:
        print(heading)
        speller = SpellCorrect(modelClass(trainingCorpus), trainingCorpus)
        outcome = speller.evaluate(devCorpus)
        # Joining with a space reproduces the separator that a two-item print
        # statement inserts, so the emitted text is unchanged.
        print(' '.join((str(outcome), '\n')))
Ejemplo n.º 8
0
def main():
    """Build each language model on the micro en_US corpus and score one sample sentence.

    The sample sentence is tokenized once and scored by every model; each
    score is printed under a heading.  The Laplace unigram, bigram and
    N-gram models are also written to ``small*.LM`` files through their
    ``save`` method.
    """
    trainPath = '../data/micro/en_US/'
    trainingCorpus = CapstoneCorpus(trainPath)

    sent = "When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
    tokens = Tokenize(sent)

    print('Uniform Language Model: ')
    uniformLM = UniformLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(uniformLM.words)))
    print(sent)
    print(tokens)
    print("uniform score=" + str(uniformLM.score(tokens)))

    print('Unigram Language Model: ')
    unigramLM = UnigramLanguageModel(trainingCorpus)
    print("VocSize= " + str(len(unigramLM.unigramCounts)))
    print("unigram score=" + str(unigramLM.score(tokens)))

    print('Laplace Unigram Language Model: ')
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramLM.save("smallUnigram.LM")
    print("VocSize= " + str(len(laplaceUnigramLM.f1)))
    print("unigram score=" + str(laplaceUnigramLM.score(tokens)))

    print('Laplace Bigram Language Model: ')
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramLM.save("smallBigram.LM")
    print("bigram score=" + str(laplaceBigramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=2')
    laplaceN2gramLM = LaplaceNgramLanguageModel(trainingCorpus, 2)
    laplaceN2gramLM.save("smallN2gram.LM")
    print("N=2gram score=" + str(laplaceN2gramLM.score(tokens)))

    print('Laplace Ngram Language Model: N=3')
    laplaceN3gramLM = LaplaceNgramLanguageModel(trainingCorpus, 3)
    laplaceN3gramLM.save("smallN3gram.LM")
    # Score with the N=3 model itself; scoring with the N=2 model here would
    # just repeat the previous line's number under the wrong label.
    print("N=3gram score=" + str(laplaceN3gramLM.score(tokens)))

    print('Custom Language Model: ')
    customLM = CustomLanguageModel(trainingCorpus, N=2)
    print("Custom LM score=" + str(customLM.score(tokens)))
Ejemplo n.º 9
0
def main():
    """Evaluate the enabled language models as spelling correctors on the dev data.

    Each model is constructed from the training corpus, wrapped in a
    SpellCorrect instance and evaluated on the dev corpus; a heading and the
    evaluation outcome are printed per model.  Point the dev corpus path at
    another file to evaluate on different data (e.g. the training data).
    """
    trainingCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    devCorpus = HolbrookCorpus('../data/holbrook-tagged-dev.dat')

    # The Uniform and Stupid Backoff models are not evaluated in this run.
    enabled = (
        ('\nLaplace Unigram Language Model: ', LaplaceUnigramLanguageModel),
        ('\nLaplace Bigram Language Model: ', LaplaceBigramLanguageModel),
        ('\nCustom Language Model: ', CustomLanguageModel),
    )

    for heading, modelClass in enabled:
        print(heading)
        speller = SpellCorrect(modelClass(trainingCorpus), trainingCorpus)
        print(str(speller.evaluate(devCorpus)))
Ejemplo n.º 10
0
def output(partId, ch_aux):
    """Compute the grader output for one assignment part using the student code.

    ``ch_aux`` is loaded into a test corpus, a language model is chosen from
    ``partId`` (1-2 Laplace unigram, 3-4 Laplace bigram, 5-6 stupid backoff,
    7-8 custom) and ``SpellCorrect.correctCorpus`` is run on the test corpus.
    The first character of that output is replaced by ``[["<partId>"],`` so
    the part id travels with the answer.

    Returns None, after printing a diagnostic, when ``partId`` is not one of
    the known parts.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    if partId in (1, 2):
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in (3, 4):
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in (5, 6):
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId in (7, 8):
        lm = CustomLanguageModel(trainCorpus)
    else:
        # str() keeps the concatenation valid whatever type partId has.
        print('Unknown partId: ' + str(partId))
        return None

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well
    return '[["%d"],%s' % (partId, corrected[1:])