Example #1
0
  def output(self, partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Args:
      partId: numeric part identifier. Parts 1-2 exercise the edit model,
        3-4 the Laplace unigram model, 5-6 the Laplace bigram model,
        7-8 the stupid backoff model and 9-10 the custom model.
      ch_aux: test input as text. For parts 1-2 every line is stripped and
        handed to EditModel.edits; for the other parts the whole text is
        loaded into a HolbrookCorpus with slurpString.

    Returns:
      Parts 1-2: a JSON string holding, for each input line, the list of
      (editedWord, rule()) pairs of the edits the edit model produced.
      Parts 3-10: the result of SpellCorrect.correctCorpus with its first
      character replaced by the prefix '[["<partId>"],'.
      None when partId is not one of 1-10 (a message is printed).
    """
    # Reject unknown parts up front so no corpus is loaded for nothing.
    if partId not in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10):
      # The id is formatted into the message; previously the text
      # ' + partId' was part of the string literal and the id never showed.
      print('Unknown partId: %s' % (partId,))
      return None

    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

    if partId in (1, 2):
      editModel = EditModel('../data/count_1edit.txt', trainCorpus)
      return json.dumps([[(e.editedWord, e.rule())
                          for e in editModel.edits(line.strip())]
                         for line in ch_aux.split("\n")])

    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)
    if partId in (3, 4):
      lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in (5, 6):
      lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in (7, 8):
      lm = StupidBackoffLanguageModel(trainCorpus)
    else:  # parts 9 and 10, the only ids left after the check above
      lm = CustomLanguageModel(trainCorpus)

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well: drop the first character of the speller
    # output and prepend a one-element list holding the part id
    return '[["%d"],%s' % (partId, corrected[1:])
Example #2
0
    def output(self, partId, ch_aux):
        """Uses the student code to compute the output for test cases.

        Args:
            partId: numeric part identifier. Parts 1-2 exercise the edit
                model, 3-4 the Laplace unigram model, 5-6 the Laplace bigram
                model, 7-8 the stupid backoff model and 9-10 the custom model.
            ch_aux: test input as text. For parts 1-2 every line is stripped
                and handed to EditModel.edits; for the other parts the whole
                text is loaded into a HolbrookCorpus with slurpString.

        Returns:
            Parts 1-2: a JSON string holding, for each input line, the list
            of (editedWord, rule()) pairs of the edits the edit model produced.
            Parts 3-10: the result of SpellCorrect.correctCorpus with its
            first character replaced by the prefix '[["<partId>"],'.
            None when partId is not one of 1-10 (a message is printed).
        """
        # Reject unknown parts up front so no corpus is loaded for nothing.
        if partId not in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10):
            # The id is formatted into the message; previously the text
            # ' + partId' was part of the string literal and never showed.
            print('Unknown partId: %s' % (partId,))
            return None

        trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

        if partId in (1, 2):
            editModel = EditModel('../data/count_1edit.txt', trainCorpus)
            return json.dumps([[(e.editedWord, e.rule())
                                for e in editModel.edits(line.strip())]
                               for line in ch_aux.split("\n")])

        testCorpus = HolbrookCorpus()
        testCorpus.slurpString(ch_aux)
        if partId in (3, 4):
            lm = LaplaceUnigramLanguageModel(trainCorpus)
        elif partId in (5, 6):
            lm = LaplaceBigramLanguageModel(trainCorpus)
        elif partId in (7, 8):
            lm = StupidBackoffLanguageModel(trainCorpus)
        else:  # parts 9 and 10, the only ids left after the check above
            lm = CustomLanguageModel(trainCorpus)

        speller = SpellCorrect(lm, trainCorpus)
        corrected = speller.correctCorpus(testCorpus)
        # put in the part ID as well: drop the first character of the speller
        # output and prepend a one-element list holding the part id
        return '[["%d"],%s' % (partId, corrected[1:])
Example #3
0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    print 'Unigram Language Model: '
    unigramLM = UnigramLanguageModel(trainingCorpus)
    unigramSpell = SpellCorrect(unigramLM, trainingCorpus)
    unigramOutcome = unigramSpell.evaluate(devCorpus)
    print str(unigramOutcome)

    print 'Uniform Language Model: '
    uniformLM = UniformLanguageModel(trainingCorpus)
    uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    uniformOutcome = uniformSpell.evaluate(devCorpus)
    print str(uniformOutcome)

    print 'Laplace Unigram Language Model: '
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus)
    laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus)
    print str(laplaceUnigramOutcome)

    print 'Laplace Bigram Language Model: '
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus)
    laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus)
    print str(laplaceBigramOutcome)

    print 'Stupid Backoff Language Model: '
    sbLM = StupidBackoffLanguageModel(trainingCorpus)
    sbSpell = SpellCorrect(sbLM, trainingCorpus)
    sbOutcome = sbSpell.evaluate(devCorpus)
    print str(sbOutcome)

    print 'Custom Language Model (based on LaplaceBigramLanguageModel): '
    customLM = CustomLanguageModel(trainingCorpus)
    customSpell = SpellCorrect(customLM, trainingCorpus)
    customOutcome = customSpell.evaluate(devCorpus)
    print str(customOutcome)

    print 'Custom Language Model2 (based on StupidBackoffLanguageModel): '
    customLM2 = CustomLanguageModel2(trainingCorpus)
    customSpell2 = SpellCorrect(customLM2, trainingCorpus)
    customOutcome2 = customSpell2.evaluate(devCorpus)
    print str(customOutcome2)
Example #4
0
def main():
  """Evaluates the Stupid Backoff language model on the dev data.

  The model is built from the training corpus, used by a SpellCorrect
  instance, and the result of evaluating that speller on the dev corpus is
  printed. Change the second path below to test on other data, such as the
  training data.
  """
  trainingCorpus, devCorpus = [
      HolbrookCorpus(path)
      for path in ('data/holbrook-tagged-train.dat',
                   'data/holbrook-tagged-dev.dat')
  ]

  print('Stupid Backoff Language Model: ')
  speller = SpellCorrect(StupidBackoffLanguageModel(trainingCorpus),
                         trainingCorpus)
  print(str(speller.evaluate(devCorpus)))
Example #5
0
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)

    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    print 'Uniform Language Model: '
    uniformLM = UniformLanguageModel(trainingCorpus)
    uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    uniformOutcome = uniformSpell.evaluate(devCorpus)
    print str(uniformOutcome), '\n'

    print 'Laplace Unigram Language Model: '
    laplaceUnigramLM = LaplaceUnigramLanguageModel(trainingCorpus)
    laplaceUnigramSpell = SpellCorrect(laplaceUnigramLM, trainingCorpus)
    laplaceUnigramOutcome = laplaceUnigramSpell.evaluate(devCorpus)
    print str(laplaceUnigramOutcome), '\n'

    #It has (accuracy: 0.012739) because of the small corpus (I think ^_^)
    print 'Good-Turing Unigram Language Model: '
    GoodTuringLM = GoodTuringUnigramLanguageModel(trainingCorpus)
    GoodTuringSpell = SpellCorrect(GoodTuringLM, trainingCorpus)
    GoodTuringOutcome = GoodTuringSpell.evaluate(devCorpus)
    print str(GoodTuringOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Laplace Bigram Language Model: '
    laplaceBigramLM = LaplaceBigramLanguageModel(trainingCorpus)
    laplaceBigramSpell = SpellCorrect(laplaceBigramLM, trainingCorpus)
    laplaceBigramOutcome = laplaceBigramSpell.evaluate(devCorpus)
    print str(laplaceBigramOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Stupid Backoff Language Model: '
    sbLM = StupidBackoffLanguageModel(trainingCorpus)
    sbSpell = SpellCorrect(sbLM, trainingCorpus)
    sbOutcome = sbSpell.evaluate(devCorpus)
    print str(sbOutcome), '\n'

    #This model takes some time, about (70) seconds
    print 'Custom Language Model: '
    customLM = CustomLanguageModel(trainingCorpus)
    customSpell = SpellCorrect(customLM, trainingCorpus)
    customOutcome = customSpell.evaluate(devCorpus)
    print str(customOutcome), '\n'
Example #6
0
def main():
  """Sanity checks the edit model on the word 'hi'.

  Builds an EditModel from the edit-count file and the training corpus, then,
  for each of the four edit kinds (delete, insert, transpose, replace), prints
  a heading and hands the model's edits for 'hi' together with the expected
  edits to checkOverlap.
  """
  corpus = HolbrookCorpus('data/holbrook-tagged-train.dat')
  model = EditModel("data/count_1edit.txt", corpus)

  # Expected edits; these are for testing, you can ignore them.
  letters = 'abcdefghijklmnopqrstuvwxyz'

  # NOTE: the expected deletions are plain strings, not Edit objects.
  expectedDeletes = set(['Edit(editedWord=i, rule=<h|<)',
                         'Edit(editedWord=h, rule=hi|h)'])

  # One letter inserted at the start, after 'h', or after 'i'.
  expectedInserts = set(
      [Edit(c + 'hi', '<', '<' + c) for c in letters] +
      [Edit('h' + c + 'i', 'h', 'h' + c) for c in letters] +
      [Edit('hi' + c, 'i', 'i' + c) for c in letters])

  expectedTransposes = set([Edit('ih', 'hi', 'ih')])

  # 'h' or 'i' swapped for any other letter (never for itself).
  expectedReplaces = set(
      [Edit(c + 'i', 'h', c) for c in letters if c != 'h'] +
      [Edit('h' + c, 'i', c) for c in letters if c != 'i'])

  print("***Code Sanity Check***")

  print("Delete edits for 'hi'")
  checkOverlap(set(model.deleteEdits('hi')), expectedDeletes)

  print("Insert edits for 'hi'")
  checkOverlap(set(model.insertEdits('hi')), expectedInserts)

  print("Transpose edits for 'hi'")
  checkOverlap(set(model.transposeEdits('hi')), expectedTransposes)

  print("Replace edits for 'hi'")
  checkOverlap(set(model.replaceEdits('hi')), expectedReplaces)
Example #7
0
def main():
    """Evaluate the enabled language models on the dev corpus and print results.

    Every enabled model is built from the training corpus, wrapped in a
    SpellCorrect instance and evaluated on the dev corpus. To evaluate on
    other data (for example the training data), change the dev corpus path.
    """
    trainingCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    devCorpus = HolbrookCorpus('../data/holbrook-tagged-dev.dat')

    def report(heading, modelClass):
        # Print the heading, build the model from the training corpus, run
        # the spell corrector on the dev corpus and print what it returned.
        print(heading)
        speller = SpellCorrect(modelClass(trainingCorpus), trainingCorpus)
        print(str(speller.evaluate(devCorpus)))

    # Disabled:
    # report('Uniform Language Model: ', UniformLanguageModel)

    report('\nLaplace Unigram Language Model: ', LaplaceUnigramLanguageModel)
    report('\nLaplace Bigram Language Model: ', LaplaceBigramLanguageModel)

    # Disabled:
    # report('\nStupid Backoff Language Model: ', StupidBackoffLanguageModel)

    report('\nCustom Language Model: ', CustomLanguageModel)
Example #8
0
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Args:
        partId: numeric part identifier. Parts 1-2 select the Laplace unigram
            model, 3-4 the Laplace bigram model, 5-6 the stupid backoff model
            and 7-8 the custom model.
        ch_aux: test input as text; it is loaded into a HolbrookCorpus with
            slurpString.

    Returns:
        The result of SpellCorrect.correctCorpus on the test corpus with its
        first character replaced by the prefix '[["<partId>"],', or None when
        partId is not one of 1-8 (a message is printed).
    """
    # Reject unknown parts up front so no corpus is loaded for nothing.
    if partId not in (1, 2, 3, 4, 5, 6, 7, 8):
        # The id is formatted into the message; previously the text
        # ' + partId' was part of the string literal and never showed.
        print('Unknown partId: %s' % (partId,))
        return None

    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    if partId in (1, 2):
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in (3, 4):
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in (5, 6):
        lm = StupidBackoffLanguageModel(trainCorpus)
    else:  # parts 7 and 8, the only ids left after the check above
        lm = CustomLanguageModel(trainCorpus)

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well: drop the first character of the speller
    # output and prepend a one-element list holding the part id
    return '[["%d"],%s' % (partId, corrected[1:])
Example #9
0
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Args:
        partId: numeric part identifier. Parts 1-2 select the Laplace unigram
            model, 3-4 the Laplace bigram model, 5-6 the stupid backoff model
            and 7-8 the custom model.
        ch_aux: test input as text; it is loaded into a HolbrookCorpus with
            slurpString.

    Returns:
        The result of SpellCorrect.correctCorpus on the test corpus with its
        first character replaced by the prefix '[["<partId>"],', or None when
        partId is not one of 1-8 (a message is printed).
    """
    # Reject unknown parts up front so no corpus is loaded for nothing.
    if partId not in (1, 2, 3, 4, 5, 6, 7, 8):
        # The id is formatted into the message; previously the text
        # ' + partId' was part of the string literal and never showed.
        # A single parenthesised argument prints identically on Python 2 and 3.
        print('Unknown partId: %s' % (partId,))
        return None

    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')
    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    if partId in (1, 2):
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in (3, 4):
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in (5, 6):
        lm = StupidBackoffLanguageModel(trainCorpus)
    else:  # parts 7 and 8, the only ids left after the check above
        lm = CustomLanguageModel(trainCorpus)

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # put in the part ID as well: drop the first character of the speller
    # output and prepend a one-element list holding the part id
    return '[["%d"],%s' % (partId, corrected[1:])
def langModel():
    """Build and return a LaplaceBigramLanguageModel for the Europarl text.

    The file is read through HolbrookCorpus and the resulting corpus is
    passed to the model constructor.
    """
    # Alternative training file: 'holbrook-tagged-train.dat'
    europarlPath = "es-en/train/europarl-v7.es-en.en"
    return LaplaceBigramLanguageModel(HolbrookCorpus(europarlPath))
#!/usr/bin/env python
# -*- coding: utf-8 -*

import numpy
import nltk
from nltk.tag.stanford import POSTagger
from Datum import Datum
from Sentence import Sentence
from HolbrookCorpus import HolbrookCorpus
from StupidBackoffLanguageModel import StupidBackoffLanguageModel

### Test Bigram Backoff Language model
# Build the stupid-backoff model from the Holbrook training corpus.
eng_corpus = HolbrookCorpus('holbrook-tagged-train.dat')
eng_model = StupidBackoffLanguageModel(eng_corpus)

# The same sentence is scored and reported twice, exactly as before.
for sentence in ['what do you want to eat for dinner'] * 2:
    print("".join(["Score for sentence \"", sentence, "\": ",
                   str(eng_model.score(sentence.split()))]))

### Test POS
## Configure this to be your Java directory
# nltk.internals.config_java(u"C:/Program Files/Java/jre7/bin/java.exe")

# chunk = u"古往今来 , 有 多少 的 成功者 被 人们 赞赏"

# text = nltk.word_tokenize(chunk.encode('utf-8'))
#st = POSTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')
Example #12
0
def main():
      """
      Train all the implemented language models and test them on the test data.

      For every model a progress line is printed, and the evaluation output
      together with the reported run time is written to 'ComparisonLM.log'.
      If the training data file cannot be opened, a message is printed and the
      function returns without evaluating anything.
      """
      # The corpus is a list of sentences, with start/stop symbols, in which a
      # corrected word is followed by the misspelled word in parentheses, e.g.
      # <s> lucky (luckily) enough it was mostly tinned (tin) food </s>
      trainPath = './data/holbrook-tagged-train.dat'
      testPath = './data/holbrook-tagged-dev.dat'

      # Probe the training file first. Opening it is enough to tell whether
      # the path is right; the context manager closes it again immediately.
      try:
          with open(trainPath, "r"):
              pass
      except IOError:
          print("Files not found. Check if in the right directory path!")
          # Stop here: everything below needs the data files.
          return

      trainCorpus = HolbrookCorpus(trainPath)
      testCorpus = HolbrookCorpus(testPath)

      # (display name, language model class) in evaluation order.
      models = (
          ('Unigram', UnigramLM),
          ('Laplace Unigram', LaplaceUnigramLM),
          ('Laplace Bigram', LaplaceBigramLM),
          ('Stupid Backoff', StupidBackoffLM),
          ('Stupid Backoff with Add-one Smoothing', StupidBackoffSmoothLM),
          ('Modified Kneser Ney Smoothing', MKneserNeyLM),
          ('Katz Backoff Smoothing', KatzBackoffGTLM),
      )

      with open('ComparisonLM.log','w') as f:

          f.write('Comparison of different language models: \n')
          for name, lmClass in models:
              # A blank line separates the header and each model section.
              f.write('\n')
              print (name + ' Language Model Evaluation')
              f.write(name + ' Language Model: \n')
              speller = SpellCorrection(lmClass(trainCorpus), trainCorpus)
              # evaluation() yields the outcome and the elapsed time.
              outcome, t = speller.evaluation(testCorpus)
              f.write(str(outcome))
              f.write('\nTime to run (seconds): ')
              f.write(str(t)+'\n')