def getBigramPerplexity():
    '''
        Controller method that computes the perplexity of every test book against
        each genre's bigram model
    '''
    bigram_model = loadBigramModels()

    # Read each test book one file at a time and store its tokens keyed by book name
    book_tokens = {}
    for genre in genres:
        print("\nReading test files for genre {0}".format(genre))
        for path in os.listdir(test_path + genre):
            if not path.startswith('.'):
                book_tokens[path] = getTokensForFile(test_path + genre + '/' + path)

    # Construct bigrams from consecutive unigram tokens
    book_bigrams = {}
    for path, tokens in book_tokens.items():
        book_bigrams[path] = [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]

    # Compute the bigram perplexity of every test book against every genre model
    book_perplexity = defaultdict(dict)
    for book, bigrams in book_bigrams.items():
        for genre in genres:
            book_perplexity[book][genre] = computeBigramPerplexity(bigram_model[genre], bigrams)
            print("Perplexity of '{0}' book on {1} genre model: {2}".format(book, genre, book_perplexity[book][genre]))

    return book_perplexity
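
# Usage sketch (not part of the original pipeline): the mapping returned by
# getBigramPerplexity() can be used to label each test book with the genre whose
# model assigns it the lowest perplexity. Purely illustrative; it assumes only
# the book -> genre -> perplexity structure built above.
def _pickBestGenreSketch(book_perplexity):
    # For every book, choose the genre with the minimum perplexity score
    return {book: min(scores, key=scores.get) for book, scores in book_perplexity.items()}
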
def getBigramsForGenre(dir_path, unknown_words=True):
    '''
        Reads every file in the given directory path and builds the list of bigrams
        for a genre-level corpus, along with the words that follow a sentence-start marker
    '''
    genre_tokens               = []
    genre_startchar_successors = []
    
    for path in os.listdir(dir_path):

        # Skip hidden files, then read each file's contents and collect its tokens
        if not path.startswith('.'):
            tokens = getTokensForFile(dir_path + '/' + path)
            genre_tokens.extend(tokens)

    #Finding the list of words that are sentence starters in the current corpus
    genre_startchar_successors.extend(getStartCharSuccessorsForGenre(genre_tokens))

    #Modifying the list of tokens by inserting <UNKNOWN> for tokens that occur only once
    # (skipped when the caller passes unknown_words=False)
    mod_tokens = insertUnknownWords(genre_tokens) if unknown_words else genre_tokens

    # Create the list of bigrams from the modified tokens.
    # This includes the few bigrams that span the end of one file and the start of the
    # next, but that is negligible (~5 bigrams out of tens of thousands).
    genre_bigram = [(mod_tokens[i], mod_tokens[i + 1]) for i in range(len(mod_tokens) - 1)]

    return genre_bigram, genre_startchar_successors
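
# Illustrative follow-up (hypothetical variable names; the actual model building
# lives elsewhere in this module): the bigram list returned above would typically
# be reduced to frequency tables before probabilities are estimated, e.g.
#
#   genre_bigrams, start_successors = getBigramsForGenre(training_path + 'some_genre')
#   bigram_counts  = Counter(genre_bigrams)                  # (w1, w2) -> count
#   context_counts = Counter(w1 for w1, _ in genre_bigrams)  # w1 -> count
#   # maximum-likelihood estimate of P(w2 | w1):
#   #   bigram_counts[(w1, w2)] / float(context_counts[w1])
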
def getUnigramFrequenciesforTrainingSet():
    '''
        Wrapper method to get the unigram frequency distribution across all 
        genres
    '''
    unigram_frequencies = {}
    for genre in genres:
        print("\nReading files for genre {0}".format(genre))
        word_list = []
        
        #Reads in the unigrams one file at a time
        for path in os.listdir(training_path + genre):
            if not path.startswith('.'):
                word_list.extend(getTokensForFile(training_path + genre + '/' + path))
        
        #Creating a counter of the frequencies at the genre level
        unigram_frequencies[genre] = Counter(word_list)
    
    return unigram_frequencies
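
# Example use (assumes `training_path` and `genres` are configured as elsewhere
# in this module):
#
#   unigram_freqs = getUnigramFrequenciesforTrainingSet()
#   for genre, counts in unigram_freqs.items():
#       print("{0}: {1}".format(genre, counts.most_common(5)))
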
def getUnigramPerplexity():
    '''
        Controller method that computes the perplexity of every test book against
        each genre's unigram model
    '''
    unigram_model = loadUnigramModels()
    
    # Read each test book one file at a time and store its tokens keyed by book name
    book_tokens = {}
    for genre in genres:
        print("\nReading test files for genre {0}".format(genre))
        for path in os.listdir(test_path + genre):
            if not path.startswith('.'):
                book_tokens[path] = getTokensForFile(test_path + genre + '/' + path)
    
    # Compute the perplexity of every test book against each of the unigram models
    book_perplexity = defaultdict(dict)
    for book, unigrams in book_tokens.items():
        print('')
        for genre, model in unigram_model.items():
            book_perplexity[book][genre] = computeUnigramPerplexity(model, unigrams)
            print("Perplexity of '{0}' book on {1} genre model: {2}".format(book, genre, book_perplexity[book][genre]))
    
    return book_perplexity
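
# For reference, a minimal sketch of the standard perplexity definition that
# computeUnigramPerplexity is assumed to follow (this is NOT necessarily the
# project's implementation): PP(W) = exp(-(1/N) * sum_i log P(w_i)).
def _unigramPerplexitySketch(token_prob, tokens):
    import math
    # token_prob is a hypothetical callable mapping a token to its model probability
    log_prob_sum = sum(math.log(token_prob(token)) for token in tokens)
    return math.exp(-log_prob_sum / len(tokens))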