def __init__(self, nTag, nWord, trainingCorpus, alpha):
    """Build the word n-gram model and the tag statistics from a corpus.

    Args:
        nTag: order of the tag n-gram; also selects the pickled tagger
            file ``../taggers/t<nTag>.pkl``.
        nWord: order passed to ``NgramModel`` for the word model.
        trainingCorpus: token sequence handed both to ``NgramModel`` and
            to the tagger's ``tag`` method.
        alpha: stored as ``self._alpha`` for later use by the instance.

    Raises:
        OSError: if the pickled tagger file cannot be opened.
    """
    self._nTag = nTag
    self._nWord = nWord
    self._alpha = alpha

    # Read in the trained n-gram tagger from file.
    # NOTE: the path is relative to the current working directory.
    # NOTE(security): unpickling executes arbitrary code; only load tagger
    # files that come from a trusted source.
    filepath = '../taggers/t' + str(nTag) + '.pkl'
    with open(filepath, 'rb') as taggerFile:
        self._tagger = load(taggerFile)

    # Word-level model; WittenBellProbDist is the estimator argument.
    self._ngram = NgramModel(nWord, trainingCorpus, WittenBellProbDist, True, True)

    # Tag our own training corpus using the trained n-gram tagger.
    # Each tagged item is indexed as item[0] (word) and item[1] (tag).
    taggedTrainingCorpus = self._tagger.tag(trainingCorpus)
    size = len(taggedTrainingCorpus)

    # Counts for p(t_i | t_{i-(nTag-1)} .. t_{i-1}): the condition is the
    # tuple of the nTag-1 preceding tags, the sample is the current tag.
    cfdistTag = ConditionalFreqDist(
        (
            tuple(item[1] for item in taggedTrainingCorpus[i - (nTag - 1):i]),
            taggedTrainingCorpus[i][1],
        )
        for i in range(nTag - 1, size)
    )
    self._cFdistTag = cfdistTag
    self._probDistTag = ConditionalProbDist(cfdistTag, MLEProbDist)

    # Counts for p(w_i | t_i): condition is the tag, sample is the word.
    cfdist = ConditionalFreqDist(
        (item[1], item[0]) for item in taggedTrainingCorpus
    )
    self._cFdist = cfdist
    self._probDist = ConditionalProbDist(cfdist, MLEProbDist)
import corpus.lyric_corpus.corpus_access as corpus
from nGram.NgramTagModel import NgramTagModel
from nGram.nGramModel import NgramModel
from nltk.probability import *
import random
import nltk.corpus

# Song corpus.
# NOTE: this rebinds the name `corpus` from the imported module to the first
# value returned by loadCorpus(); the module is not reachable by that name
# after this line.
corpus, trainCorpus, testCorpus, testSents = corpus.loadCorpus()

# Brown corpus for comparison, split 80% / 20% into train and test parts.
brown = nltk.corpus.brown.words()
size = int(len(brown) * 0.8)
trainBrown = brown[:size]
testBrown = brown[size:]

# Perplexity for different N-grams with back-off.
# Unigrams (i == 1) use LaplaceProbDist; higher orders use MLEProbDist.
for i in range(1, 5):
    if i == 1:
        # lm1 is trained on the Brown training split so that the
        # "Brown perplexity" figure is measured in-domain (previously it was
        # trained on the song corpus and trainBrown was never used).
        lm1 = NgramModel(i, trainBrown, LaplaceProbDist, False, False)
        lm2 = NgramModel(i, trainCorpus, LaplaceProbDist, False, False)
    else:
        lm1 = NgramModel(i, trainBrown, MLEProbDist, False, True)
        lm2 = NgramModel(i, trainCorpus, MLEProbDist, False, True)
    print("N-gram :", i)
    print("Brown perplexity: ", lm1.perplexity(testBrown))
    print("Song corpus: ", lm2.perplexity(testCorpus))
__author__ = 'qrr'

# Generate random sentences with a plain word N-gram model.
# Used to run experiments comparing different N and different corpora.

import corpus.lyric_corpus.corpus_access as corpus
from nGram.nGramModel import NgramModel

# Alternative corpus selections (swap in to compare):
# corpus, trainCorpus, testCorpus, devCorpus = corpus.loadCorpus()
# corpus, trainCorpus, testCorpus, devCorpus = corpus.loadCorpus("POP")
# NOTE: the assignment rebinds `corpus` from the module to the loaded data.
corpus, trainCorpus, testCorpus, devCorpus = corpus.loadCorpus("ROCK")

# The model is built from the whole loaded corpus, not just the training part.
lm = NgramModel(4, corpus)

# Number of distinct tokens in the loaded corpus.
vocabularyNum = len(set(corpus))
print("vocabulary number:", vocabularyNum)

print("4-gram")
for _ in range(15):
    print(lm.generateRandomSentence())
class NgramTagModel:
    """Next-word chooser that mixes a word n-gram model with tag statistics.

    A candidate word ``w`` is scored as::

        alpha * p_ngram(w | previous words)
            + (1 - alpha) * max_t [ p(w | t) * p(t | previous tags) ]

    where the tag statistics are maximum-likelihood estimates taken from the
    training corpus after it has been tagged by a pickled n-gram tagger.
    """

    def __init__(self, nTag, nWord, trainingCorpus, alpha):
        """Build the word n-gram model and the tag statistics from a corpus.

        Args:
            nTag: order of the tag n-gram; also selects the pickled tagger
                file ``../taggers/t<nTag>.pkl``.
            nWord: order passed to ``NgramModel`` for the word model.
            trainingCorpus: token sequence handed both to ``NgramModel`` and
                to the tagger's ``tag`` method.
            alpha: weight of the word n-gram probability in
                ``linearCombination``; the tag-based score gets ``1 - alpha``.

        Raises:
            OSError: if the pickled tagger file cannot be opened.
        """
        self._nTag = nTag
        self._nWord = nWord
        self._alpha = alpha

        # Read in the trained n-gram tagger from file.
        # NOTE: the path is relative to the current working directory.
        # NOTE(security): unpickling executes arbitrary code; only load
        # tagger files that come from a trusted source.
        filepath = '../taggers/t' + str(nTag) + '.pkl'
        with open(filepath, 'rb') as taggerFile:
            self._tagger = load(taggerFile)

        # Word-level model; WittenBellProbDist is the estimator argument.
        self._ngram = NgramModel(nWord, trainingCorpus, WittenBellProbDist, True, True)

        # Tag our own training corpus using the trained n-gram tagger.
        # Each tagged item is indexed as item[0] (word) and item[1] (tag).
        taggedTrainingCorpus = self._tagger.tag(trainingCorpus)
        size = len(taggedTrainingCorpus)

        # Counts for p(t_i | t_{i-(nTag-1)} .. t_{i-1}): the condition is the
        # tuple of the nTag-1 preceding tags, the sample is the current tag.
        cfdistTag = ConditionalFreqDist(
            (
                tuple(item[1] for item in taggedTrainingCorpus[i - (nTag - 1):i]),
                taggedTrainingCorpus[i][1],
            )
            for i in range(nTag - 1, size)
        )
        self._cFdistTag = cfdistTag
        self._probDistTag = ConditionalProbDist(cfdistTag, MLEProbDist)

        # Counts for p(w_i | t_i): condition is the tag, sample is the word.
        # The conditions of this distribution double as the list of all tags.
        cfdist = ConditionalFreqDist(
            (item[1], item[0]) for item in taggedTrainingCorpus
        )
        self._cFdist = cfdist
        self._probDist = ConditionalProbDist(cfdist, MLEProbDist)

    def probCondMulti(self, word, tag, context):
        """Return p(word | tag) * p(tag | context).

        Args:
            word: candidate word.
            tag: candidate tag for ``word``.
            context: sequence of preceding tags; converted to a tuple to be
                used as the condition of the tag distribution.
        """
        context = tuple(context)
        wordProb = self._probDist[tag].prob(word)
        tagProb = self._probDistTag[context].prob(tag)
        return wordProb * tagProb

    def nextWordTag(self, word, context):
        """Return max over tags t of p(word | t) * p(t | context).

        Only tags that emit ``word`` with non-zero probability are evaluated.
        Returns ``0.0`` when no tag emits ``word`` (instead of leaking a
        negative sentinel into the interpolated score).
        """
        best = 0.0
        for tag in self._cFdist:
            if self._probDist[tag].prob(word) != 0:
                score = self.probCondMulti(word, tag, context)
                if score > best:
                    best = score
        return best

    @staticmethod
    def _lastItems(sequence, count):
        """Return the last ``count`` items of ``sequence`` (all if shorter)."""
        # Clamp at 0: a negative start would be read as "from the end" and
        # silently drop leading items when the sequence is shorter than count.
        start = max(0, len(sequence) - count)
        return sequence[start:]

    def linearCombination(self, contextWords, contextTags):
        """Pick the best next word for the given word and tag histories.

        Only the last ``nWord - 1`` words and the last ``nTag - 1`` tags are
        used. Candidates come from ``self._ngram.wordsInContext``; each is
        scored by the alpha-weighted mix of the word n-gram probability and
        ``nextWordTag``. On ties the first candidate wins.

        Returns:
            The highest-scoring candidate, or ``""`` if there are none.
        """
        contextWords = self._lastItems(contextWords, self._nWord - 1)
        contextTags = self._lastItems(contextTags, self._nTag - 1)
        alpha = self._alpha

        maxValue = -1
        maxWord = ""
        for word in self._ngram.wordsInContext(contextWords):
            prob = (alpha * self._ngram.prob(word, contextWords)
                    + (1 - alpha) * self.nextWordTag(word, contextTags))
            if prob > maxValue:
                maxValue = prob
                maxWord = word
        return maxWord

    def tagTestCorpus(self, testCorpus):
        """Tag ``testCorpus`` with the loaded tagger and return the result."""
        return self._tagger.tag(testCorpus)

    def getRandomContext(self, testCorpus):
        """Return a random window of ``max(nWord, nTag) - 1`` consecutive items.

        Args:
            testCorpus: sliceable sequence to draw the window from.

        Raises:
            ValueError: if ``testCorpus`` is too short to draw a start index.
        """
        numWords = max(self._nWord, self._nTag)
        size = len(testCorpus)
        upper = size - numWords - 1
        if upper <= 0:
            raise ValueError(
                "testCorpus has %d items; at least %d are required"
                % (size, numWords + 2)
            )
        seed = random.randrange(0, upper)
        return testCorpus[seed:seed + numWords - 1]

    def nextWord(self, context):
        """Predict the next word from a context of (word, tag) items.

        Args:
            context: sequence whose items hold the word at index 0 and the
                tag at index 1.

        Returns:
            The word chosen by ``linearCombination``.
        """
        listWords = [item[0] for item in context]
        listTags = [item[1] for item in context]
        return self.linearCombination(listWords, listTags)