def generateUnigramModels():
    '''
        Controller for the generation of the unigram model.

        Chains the helper steps in order: collect the unigram frequencies
        of the training set, derive the corpus features from them, compute
        the unknown-word sampling probabilities, build the model and store
        it on disk under the name 'Unigram'.

        Returns the object produced by createUnigramModel.
    '''
    # Frequency of each word in the training corpus
    word_counts = getUnigramFrequenciesforTrainingSet()

    # Word type / word token counts for the corpus (reported on stdout)
    corpus_features = getUnigramModelFeatures(word_counts)
    print("\n Unigram Features (Word Types, Work Tokens) {0} \n".format(corpus_features))

    # Unknown words are handled through sampling probabilities computed
    # from the raw frequencies.
    # NOTE: Good Turing smoothing is deliberately not applied here, as it
    # is not needed for unigrams.
    unknown_probs = getUnknownWordSamplingProbs(word_counts)

    # Turn the counts into the unigram model, i.e. the unigram probabilities
    model = createUnigramModel(word_counts, corpus_features, unknown_probs)

    # Persist the model on disk (JSON format per the original notes)
    serializeModelToDisk(model, 'Unigram')

    return model
def generateBigramModels( random_sentence = False ):
    '''
        Controller for the generation of the bigram model.

        Collects the bigrams and the sentence-starter successors for every
        genre in `genres`, turns the bigrams into frequencies, builds the
        model and serialises it to disk.

        random_sentence -- when truthy, the start-character bigram
                           frequencies are merged in before the model is
                           built and the model is stored as
                           'BigramSentenceModel'; otherwise the plain
                           frequencies are used and it is stored as 'Bigram'.

        Returns the object produced by createBigramModel.
    '''
    genre_bigrams  = {}
    genre_starters = {}

    # One pass per genre: fetch its bigrams together with the tokens that
    # follow the sentence start character.
    for genre in genres:
        print("\nReading files for genre {0}".format(genre))
        found_bigrams, found_starters = getBigramsForGenre(training_path + genre)
        genre_bigrams[genre]  = found_bigrams
        genre_starters[genre] = found_starters

    # Frequency model of the bigrams
    frequencies = getBigramFrequencies(genre_bigrams)

    # Choose which frequency table feeds the model and the name it is
    # stored under; the sentence model also counts start-character bigrams.
    if random_sentence:
        model_input = getStartCharBigramFrequencies(frequencies, genre_starters)
        model_name  = 'BigramSentenceModel'
    else:
        model_input = frequencies
        model_name  = 'Bigram'

    model = createBigramModel(model_input)
    serializeModelToDisk(model, model_name)

    return model