# Write a few spot-check bigram probabilities to `file`, a handle opened
# earlier in the script (outside this section). probs is indexed as
# [index of previous word, index of word], so each value is labelled
# p(word | previous).
for previous, word in (
        ('all', 'the'),
        ('the', 'jury'),
        ('the', 'campaign'),
        ('anonymous', 'calls'),
):
    file.write('p({} | {}): {}\n'.format(
        word, previous,
        probs[word_index_dict[previous], word_index_dict[word]]))

#np.savetxt('bigram_probs.txt', probs, fmt="%f")

# Score every sentence of the toy corpus and write one perplexity per line.
# Both files are managed by `with`, so they are closed even when a lookup
# fails part-way through (e.g. KeyError for a word missing from
# word_index_dict).
with open('toy_corpus.txt', encoding='utf-16') as toy, \
        open('bigram_eval.txt', 'w') as fw:
    for line in toy:
        # split() with no argument tolerates tabs and repeated spaces, which
        # split(" ") would turn into empty tokens that are not in the
        # vocabulary.
        words = [token.lower() for token in line.split()]
        if not words:
            # A blank line has nothing to score; skip it instead of emitting
            # a meaningless perplexity.
            continue

        # sent_len counts every token on the line, including any boundary
        # markers the corpus contains.
        # NOTE(review): only sent_len - 1 bigrams are multiplied below --
        # confirm this normaliser against the assignment's definition.
        sent_len = len(words)

        sent_prob = 1
        for word1, word2 in zip(words, words[1:]):
            sent_prob *= probs[word_index_dict[word1]][word_index_dict[word2]]

        perplexity = 1 / (pow(sent_prob, 1.0 / sent_len))
        fw.write(str(perplexity) + '\n')

# Write ten generated sentences, one per line. GENERATE is defined elsewhere
# in the file; its return value is concatenated with '\n', so it is expected
# to return a string.
with open('bigram_generation.txt', 'w') as f:
    for _ in range(10):
        f.write(GENERATE(word_index_dict, probs, "bigram", 10, '<s>') + '\n')
# problem 6: Calculating sentence probabilities
#
# For each line of the toy corpus, multiply the probs entries of consecutive
# word pairs (starting from the "<s>" marker) and report the sentence
# probability and its perplexity in smoothed_eval.txt.
with codecs.open("toy_corpus.txt", "r", "utf-16") as sentence, \
        open("smoothed_eval.txt", "w") as output:
    for m, line in enumerate(sentence, start=1):
        tokens = line.split()
        # Drop the leading start-of-sentence marker as a whole token.
        # str.lstrip("<s>") would instead strip any run of the characters
        # '<', 's' and '>', which can eat the start of a real word.
        if tokens and tokens[0].lower() == "<s>":
            tokens = tokens[1:]

        previous_word = "<s>"
        sentprob = 1
        # n starts at 1 and grows by one per scored word, so it equals the
        # number of remaining tokens plus one for the "<s>" marker.
        n = 1
        for word in tokens:
            word = word.lower()
            sentprob *= probs[word_index_dict[previous_word],
                              word_index_dict[word]]
            n += 1
            previous_word = word

        perplexity = 1 / pow(sentprob, 1.0 / n)
        output.write(
            "The probability of sentence %d is: %e\nThe perplexity of sentence %d is: %e \n"
            % (m, sentprob, m, perplexity))

# Write ten generated sentences, one per line. GENERATE is defined elsewhere
# in the file; its return value is concatenated with "\n", so it is expected
# to return a string.
with open("smoothed_generation.txt", "w") as generation:
    for _ in range(10):
        generation.write(
            GENERATE(word_index_dict, probs, "bigram", 15, "<s>") + "\n")
# Last spot-check value for the preceding problem; output_file was opened
# earlier in the script (outside this section) and is closed here.
print(str(probs[word_index_dict['anonymous']][word_index_dict['calls']]),
      file=output_file)
output_file.close()

# below is the code for problem6
#
# For each line of the toy corpus, multiply the probs entries of consecutive
# word pairs (starting from the '<s>' marker) and print the sentence
# perplexity to smoothed_eval.txt, one value per line.
with codecs.open("toy_corpus.txt", 'r', encoding="utf-16") as f, \
        open('smoothed_eval.txt', 'w') as output_file:
    for line in f:
        words = line.split()
        # Remove the start marker only when it is actually present; an
        # unconditional `del words[0]` raises IndexError on a blank line and
        # silently discards a real word when the marker is missing.
        if words and words[0].lower() == '<s>':
            words = words[1:]
        if not words:
            # Nothing left to score, and sent_len would be 0 (division by
            # zero in the exponent below), so skip the line.
            continue

        sentprob = 1
        previous_word = '<s>'
        for w in words:
            w = w.lower()
            sentprob *= probs[word_index_dict[previous_word]][word_index_dict[w]]
            previous_word = w

        # sent_len is the number of tokens after the start marker, which is
        # also the number of probabilities multiplied above.
        sent_len = len(words)
        perplexity = 1 / (pow(sentprob, 1.0 / sent_len))
        print(str(perplexity), file=output_file)

# below is the code for problem 7
#
# Write a single generated sentence. GENERATE is defined elsewhere in the
# file; its result is printed as-is followed by a newline.
with open('smoothed_generation.txt', 'w') as output_file:
    print(GENERATE(word_index_dict, probs, 'bigram', 10, '<s>'),
          file=output_file)
# For each line read from `sentence` (a corpus handle opened earlier in the
# script, outside this section), multiply the probs entries of consecutive
# word pairs (starting from the "<s>" marker) and report the sentence
# probability and its perplexity in bigram_eval.txt.
try:
    with open("bigram_eval.txt", "w") as output:
        for m, line in enumerate(sentence, start=1):
            tokens = line.split()
            # Drop the leading start-of-sentence marker as a whole token.
            # str.lstrip("<s>") would instead strip any run of the
            # characters '<', 's' and '>', which can eat the start of a
            # real word.
            if tokens and tokens[0].lower() == "<s>":
                tokens = tokens[1:]

            previous_word = "<s>"
            sentprob = 1
            # n starts at 1 and grows by one per scored word, so it equals
            # the number of remaining tokens plus one for the "<s>" marker.
            n = 1
            for word in tokens:
                word = word.lower()
                sentprob *= probs[word_index_dict[previous_word],
                                  word_index_dict[word]]
                n += 1
                previous_word = word

            perplexity = 1 / pow(sentprob, 1.0 / n)
            output.write(
                "The probability of sentence %d is: %e\nThe perplexity of sentence %d is: %e \n"
                % (m, sentprob, m, perplexity))
finally:
    # Close the corpus handle even if scoring a sentence raised.
    sentence.close()

# Write ten generated sentences, one per line. GENERATE is defined elsewhere
# in the file; its return value is concatenated with "\n", so it is expected
# to return a string.
with open("bigram_generation.txt", "w") as generation:
    for _ in range(10):
        generation.write(
            GENERATE(word_index_dict, probs, "bigram", 15, "<s>") + "\n")