def __init__(self, filename, N):
    """Build an N-gram language model from a corpus.

    The model stores the frequencies of the N-grams and of the
    (N-1)-grams found in the corpus; those two tables are what the
    probability of any given N-gram is calculated from.

    Args:
        filename (str): The name of the corpus to train on.
        N (int): The N value of the N-grams, where N > 1.

    Raises:
        ValueError: If N is smaller than 2. An (N-1)-gram table is
            requested below, so N must be at least 2.
    """
    if N < 2:
        # Fail loudly: silently returning here would hand the caller an
        # object without N, Nfreq or N1freq set.
        raise ValueError("N must be at least 2, got {!r}".format(N))

    self.N = N
    self.Nfreq = countNGrams(filename, N)
    self.N1freq = countNGrams(filename, N - 1)
    # Initial smoothing mode is 'no'; callers can change it afterwards
    # through setSmoothing.
    self.setSmoothing('no')
def main():
    """Run the four assignment questions using the command-line arguments.

    Reads the arguments through parseArgs, prints the most frequent
    bigrams (question 1), counts the N-grams and (N-1)-grams of the
    corpus, and passes the derived N-gram and sentence probabilities on
    to questions 2, 3 and 4.
    """
    parsed = parseArgs(sys.argv[1:])
    corpus = parsed["corpus"][0]
    CPF = parsed["conditional_prob_file"][0]
    SPF = parsed["sequence_prob_file"][0]
    SP = parsed["scored_permutations"]
    # NOTE(review): unlike the entries above, "n" is used without [0];
    # presumably parseArgs yields a plain int here -- confirm.
    N = parsed["n"]

    question1(corpus)

    # Counting N- and (N-1)-Grams
    N_freq = countNGrams(corpus, N)
    N_min_1_freq = countNGrams(corpus, N - 1)

    # Calculating probabilities
    nGramProb = NGramProbabilities(N_freq, N_min_1_freq)
    question2(nGramProb, CPF, N)

    # Use the locals computed above; the previous spellings (NGramProb,
    # sentenceProbs) were never assigned in this function.
    sentenceProb = sentenceProbabilities(nGramProb, N)
    question3(sentenceProb, SPF, N)
    question4(sentenceProb, SP)
def question1(filename, N=2, M=10):
    """Print the M most frequent N-grams of a corpus.

    Args:
        filename (str): The name of the corpus to extract N-grams from.
        N (int): (optional) The N value of the N-grams (bigrams are
            requested).
        M (int): (optional) The number of most frequent N-grams to print
            (10 are requested).
    """
    print("Question 1")
    freq = countNGrams(filename, N)

    # Ascending stable sort followed by a reversal: most frequent first.
    # Kept as two steps on purpose, since sorted(..., reverse=True) would
    # order equally frequent N-grams differently.
    ranked = sorted(freq.items(), key=itemgetter(1))
    ranked.reverse()

    # Label the first column after the requested N instead of always
    # calling it "Bigram"; the default N=2 still prints "Bigram".
    labels = {1: "Unigram", 2: "Bigram", 3: "Trigram"}
    label = labels.get(N, "{}-gram".format(N))

    table = [(label, "Frequency")] + ranked
    # One extra row is requested to account for the header row.
    prettyPrint(table, M + 1)