def main():
  """Set up the module-level data for dataset 3 and run the evaluation.

  Rebinds the globals ``nGramList`` (result of ``task_one.generateNgram``
  for ``task_one.corpus3``) and ``tokenList`` (the ``task_one.corpus3test``
  file after ``task_one.readFile`` and ``task_one.modifyFile``), then hands
  over to ``extendedMain``.
  """
  global nGramList, tokenList
  # Parenthesized single-argument form prints the same text on Python 2
  # and is also valid syntax on Python 3.
  print("Perplexity for Dataset 3 using Laplace Smoothing")
  nGramList = task_one.generateNgram(N, task_one.corpus3)
  rawTestText = task_one.readFile(task_one.corpus3test)
  tokenList = task_one.modifyFile(rawTestText)
  extendedMain()
Ejemplo n.º 2
0
def main():
    """Set up the module-level data for dataset 4 and run the evaluation.

    Rebinds the globals ``nGramList`` (result of ``task_one.generateNgram``
    for ``task_one.corpus4``) and ``tokenList`` (the ``task_one.corpus4test``
    file after ``task_one.readFile`` and ``task_one.modifyFile``), then hands
    over to ``extendedMain``.
    """
    global nGramList, tokenList
    # Parenthesized single-argument form prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print("Perplexity for Dataset 4 using Laplace Smoothing")
    nGramList = task_one.generateNgram(N, task_one.corpus4)
    rawTestText = task_one.readFile(task_one.corpus4test)
    tokenList = task_one.modifyFile(rawTestText)
    extendedMain()
Ejemplo n.º 3
0
#Task 3 : Handle unknown words and implement smoothing

from __future__ import division
import math

import task_one

# Value handed to task_one.generateNgram as its first argument
# (presumably the n-gram order, i.e. a bigram model -- confirm in task_one).
N = 2
# Parenthesized single-argument form prints the same text on Python 2
# and is also valid syntax on Python 3.
print("Perplexity for Dataset 3 using Laplace Smoothing")
# findVocabCount() below reads nGramList[0] as the unigram table.
nGramList = task_one.generateNgram(N, task_one.corpus3)
laplaceList = []
# Test file for dataset 3, read and then passed through task_one.modifyFile.
fileText = task_one.readFile(task_one.corpus3test)
tokenList = task_one.modifyFile(fileText)
nGramTestList = []

def findVocabCount():
  """Return the sum of all values stored in the unigram table.

  The unigram table is the first element of the module-level ``nGramList``.
  An empty table yields 0.

  NOTE(review): this adds up the stored values rather than counting the
  keys. If the values are per-token frequencies, the result is the total
  number of tokens, not the number of distinct word types -- confirm which
  of the two the callers expect.
  """
  unigramTable = nGramList[0]
  return sum(unigramTable[token] for token in unigramTable)

def combineTokens(listWithUnknown):
  """Merge a sequence of dictionaries into one new dictionary.

  Args:
    listWithUnknown: iterable of dict-like objects exposing ``items()``.

  Returns:
    A new dict holding every key/value pair from the inputs. When the same
    key occurs in several inputs, the value from the last one wins. An
    empty input yields an empty dict. The input dictionaries are not
    modified.
  """
  merged = {}
  for tokenCounts in listWithUnknown:
    # In-place update instead of dict(a.items() + b.items()): the latter
    # fails on Python 3 (dict views cannot be added) and re-copies the
    # whole accumulated dict on every iteration.
    merged.update(tokenCounts.items())
  return merged