def __init__(self, nTag, nWord, trainingCorpus, alpha):
        """Load the trained tagger and estimate the tag and word distributions.

        nTag           -- order of the tag n-gram; also selects the tagger
                          file '../taggers/t<nTag>.pkl'
        nWord          -- order of the word n-gram model
        trainingCorpus -- training data, passed unchanged to NgramModel and
                          to the tagger's tag() method (presumably a list of
                          word tokens -- confirm against the callers)
        alpha          -- stored as self._alpha for later use

        Raises whatever open()/load() raise when the tagger file is missing
        or unreadable.
        """
        self._nTag = nTag
        self._nWord = nWord
        self._alpha = alpha

        # Read the trained n-gram tagger from file.  The path is relative, so
        # it is resolved against the current working directory.
        # NOTE(review): `load` is defined outside this block -- presumably
        # pickle.load.  If so, only trusted tagger files may be read here,
        # because unpickling can execute arbitrary code.
        filepath = '../taggers/t' + str(nTag) + '.pkl'
        # The `with` block closes the file even when load() raises.
        with open(filepath, 'rb') as taggerFile:
            self._tagger = load(taggerFile)

        # Word-level n-gram model with Witten-Bell estimation.  Alternatives
        # previously used here: the NgramModel default, MLEProbDist and
        # LaplaceProbDist.
        # NOTE(review): the meaning of the two boolean flags is defined by
        # NgramModel, which is not visible here.
        self._ngram = NgramModel(nWord, trainingCorpus, WittenBellProbDist, True, True)

        # Tag our own training corpus using the trained n-gram tagger.
        taggedTrainingCorpus = self._tagger.tag(trainingCorpus)
        size = len(taggedTrainingCorpus)

        # Counts for p(t_i | t_{i-nTag+1} .. t_{i-1}): the condition is the
        # tuple of the nTag-1 preceding tags, the sample is the current tag.
        cfdistTag = ConditionalFreqDist(
            (tuple(item[1] for item in taggedTrainingCorpus[i - (nTag - 1):i]),
             taggedTrainingCorpus[i][1])
            for i in range(nTag - 1, size))
        self._cFdistTag = cfdistTag
        self._probDistTag = ConditionalProbDist(cfdistTag, MLEProbDist)

        # Counts for p(w_i | t_i): the condition is the tag, the sample the
        # word.  The conditions of this distribution double as the tag list.
        cfdist = ConditionalFreqDist(
            (item[1], item[0]) for item in taggedTrainingCorpus)
        self._cFdist = cfdist
        self._probDist = ConditionalProbDist(cfdist, MLEProbDist)
import corpus.lyric_corpus.corpus_access as corpus
from nGram.NgramTagModel import NgramTagModel
from nGram.nGramModel import NgramModel
from nltk.probability import *
import random
import nltk.corpus

# Song corpus.
# NOTE: this rebinds the name `corpus` from the imported module to the first
# value returned by loadCorpus().
corpus, trainCorpus, testCorpus, testSents = corpus.loadCorpus()

# Brown corpus for comparison, split 80/20 into a leading and a trailing part.
brown = nltk.corpus.brown.words()

size = int(len(brown) * 0.8)
trainBrown, testBrown = brown[:size], brown[size:]

# Perplexity for n-gram orders 1 through 4.
# NOTE(review): both models are built from trainCorpus and trainBrown is never
# used, so "Brown perplexity" is the song-trained model evaluated on the Brown
# test split -- confirm that this is the intended comparison.
for i in range(1, 5):
    # Order 1 uses LaplaceProbDist with the last flag off; higher orders use
    # MLEProbDist with it on (presumably the back-off switch -- see NgramModel).
    if i == 1:
        estimator, lastFlag = LaplaceProbDist, False
    else:
        estimator, lastFlag = MLEProbDist, True
    lm1 = NgramModel(i, trainCorpus, estimator, False, lastFlag)
    lm2 = NgramModel(i, trainCorpus, estimator, False, lastFlag)
    print("N-gram :", i)
    print("Brown perplexity: ", lm1.perplexity(testBrown))
    print("Song corpus: ", lm2.perplexity(testCorpus))

__author__ = 'qrr'

# Generate random sentences using a simple N-gram model.
# Run experiments to compare different values of N and different corpora.

import corpus.lyric_corpus.corpus_access as corpus
from nGram.nGramModel import NgramModel

# Alternative corpus selections (swap one in to rerun the experiment):
# corpus, trainCorpus, testCorpus, devCorpus = corpus.loadCorpus()
# corpus, trainCorpus, testCorpus, devCorpus = corpus.loadCorpus("POP")
corpus, trainCorpus, testCorpus, devCorpus = corpus.loadCorpus("ROCK")

# 4-gram model built from the first value returned by loadCorpus().
lm = NgramModel(4, corpus)
vocabularyNum = len(set(corpus))
print("vocabulary number:", vocabularyNum)
print("4-gram")

# Print 15 generated sentences, one per line.
for i in range(15):
    sentence = lm.generateRandomSentence()
    print(sentence)



class NgramTagModel:
    """Next-word predictor mixing a word n-gram model with a tag n-gram model.

    A candidate word w is scored (see linearCombination) as

        alpha * p_ngram(w | previous words)
        + (1 - alpha) * max over tags t of p(w | t) * p(t | previous tags)
    """

    def __init__(self, nTag, nWord, trainingCorpus, alpha):
        """Load the trained tagger and estimate the tag and word distributions.

        nTag           -- order of the tag n-gram; also selects the tagger
                          file '../taggers/t<nTag>.pkl'
        nWord          -- order of the word n-gram model
        trainingCorpus -- training data, passed unchanged to NgramModel and
                          to the tagger's tag() method (presumably a list of
                          word tokens -- confirm against the callers)
        alpha          -- weight of the word n-gram probability in
                          linearCombination; the tag-based score gets
                          1 - alpha

        Raises whatever open()/load() raise when the tagger file is missing
        or unreadable.
        """
        self._nTag = nTag
        self._nWord = nWord
        self._alpha = alpha

        # Read the trained n-gram tagger from file.  The path is relative, so
        # it is resolved against the current working directory.
        # NOTE(review): `load` is defined outside this class -- presumably
        # pickle.load.  If so, only trusted tagger files may be read here,
        # because unpickling can execute arbitrary code.
        filepath = '../taggers/t' + str(nTag) + '.pkl'
        # The `with` block closes the file even when load() raises.
        with open(filepath, 'rb') as taggerFile:
            self._tagger = load(taggerFile)

        # Word-level n-gram model with Witten-Bell estimation.  Alternatives
        # previously used here: the NgramModel default, MLEProbDist and
        # LaplaceProbDist.
        # NOTE(review): the meaning of the two boolean flags is defined by
        # NgramModel, which is not visible here.
        self._ngram = NgramModel(nWord, trainingCorpus, WittenBellProbDist, True, True)

        # Tag our own training corpus using the trained n-gram tagger.
        taggedTrainingCorpus = self._tagger.tag(trainingCorpus)
        size = len(taggedTrainingCorpus)

        # Counts for p(t_i | t_{i-nTag+1} .. t_{i-1}): the condition is the
        # tuple of the nTag-1 preceding tags, the sample is the current tag.
        cfdistTag = ConditionalFreqDist(
            (tuple(item[1] for item in taggedTrainingCorpus[i - (nTag - 1):i]),
             taggedTrainingCorpus[i][1])
            for i in range(nTag - 1, size))
        self._cFdistTag = cfdistTag
        self._probDistTag = ConditionalProbDist(cfdistTag, MLEProbDist)

        # Counts for p(w_i | t_i): the condition is the tag, the sample the
        # word.  The conditions of this distribution double as the tag list.
        cfdist = ConditionalFreqDist(
            (item[1], item[0]) for item in taggedTrainingCorpus)
        self._cFdist = cfdist
        self._probDist = ConditionalProbDist(cfdist, MLEProbDist)

    def probCondMulti(self, word, tag, context):
        """Return p(word | tag) * p(tag | context).

        context is the sequence of preceding tags; it is converted to a tuple
        because the tag distribution is keyed by tuples.
        """
        context = tuple(context)
        wordProb = self._probDist[tag].prob(word)
        tagProb = self._probDistTag[context].prob(tag)
        return wordProb * tagProb

    def nextWordTag(self, word, context):
        """Return the maximum over tags t of p(word | t) * p(t | context).

        Only tags with p(word | t) != 0 are considered.  When no tag
        qualifies, the sentinel -1 is returned.
        NOTE(review): linearCombination feeds this value straight into its
        weighted sum, so the -1 sentinel lowers the score of such a word
        instead of contributing zero -- confirm that this is intended.
        """
        maxValue = -1
        for tag in self._cFdist:
            if self._probDist[tag].prob(word) != 0:
                tmpValue = self.probCondMulti(word, tag, context)
                if tmpValue > maxValue:
                    maxValue = tmpValue
        return maxValue

    @staticmethod
    def _lastItems(items, count):
        """Return the last `count` elements of `items` (all of them if fewer)."""
        # Clamp the start at 0: a negative start index would be read by Python
        # as an offset from the end and silently drop part of the history.
        start = max(0, len(items) - count)
        return items[start:len(items)]

    def linearCombination(self, contextWords, contextTags):
        """Return the candidate next word with the highest combined score.

        contextWords -- preceding words; only the last nWord-1 are used
        contextTags  -- preceding tags; only the last nTag-1 are used

        Candidates come from self._ngram.wordsInContext(...).  Each one is
        scored as alpha * ngram probability + (1 - alpha) * nextWordTag(...).
        Ties keep the earliest candidate.  Returns "" when there is no
        candidate or no candidate scores above -1.
        """
        contextWords = self._lastItems(contextWords, self._nWord - 1)
        contextTags = self._lastItems(contextTags, self._nTag - 1)
        alpha = self._alpha
        nextWords = self._ngram.wordsInContext(contextWords)
        maxValue = -1
        maxWord = ""
        for word in nextWords:
            prob = (alpha * self._ngram.prob(word, contextWords)
                    + (1 - alpha) * self.nextWordTag(word, contextTags))
            if prob > maxValue:
                maxValue = prob
                maxWord = word
        return maxWord

    def tagTestCorpus(self, testCorpus):
        """Return testCorpus tagged by the loaded tagger."""
        return self._tagger.tag(testCorpus)

    def getRandomContext(self, testCorpus):
        """Return a random contiguous slice of testCorpus to use as context.

        The slice holds max(nWord, nTag) - 1 consecutive items.

        Raises ValueError when testCorpus is too short to draw a start index
        (len(testCorpus) - max(nWord, nTag) - 1 must be positive).
        """
        numWords = max(self._nWord, self._nTag)
        size = len(testCorpus)
        upper = size - numWords - 1
        if upper <= 0:
            # random.randrange would raise ValueError here as well; raise it
            # ourselves with a message that names the actual problem.
            raise ValueError(
                "testCorpus has %d items; more than %d are needed to pick "
                "a random context" % (size, numWords + 1))
        seed = random.randrange(0, upper)
        return testCorpus[seed:seed + numWords - 1]

    def nextWord(self, context):
        """Predict the word following a tagged context.

        context is a sequence of (word, tag) pairs; words and tags are split
        apart and handed to linearCombination.
        """
        listWords = [pair[0] for pair in context]
        listTags = [pair[1] for pair in context]
        return self.linearCombination(listWords, listTags)