Ejemplo n.º 1
0
    file.write('p(the | all): {}\n'.format(probs[word_index_dict['all'],
                                                 word_index_dict['the']]))
    file.write('p(jury | the): {}\n'.format(probs[word_index_dict['the'],
                                                  word_index_dict['jury']]))
    file.write('p(campaign | the): {}\n'.format(
        probs[word_index_dict['the'], word_index_dict['campaign']]))
    file.write('p(calls | anonymous): {}\n'.format(
        probs[word_index_dict['anonymous'], word_index_dict['calls']]))

#np.savetxt('bigram_probs.txt', probs, fmt="%f")

# Score every line of toy_corpus.txt with the bigram table `probs` and write
# one perplexity value per line to bigram_eval.txt.
# Both files are managed by a single `with`, so the output handle is flushed
# and closed even if a lookup in word_index_dict / probs raises part-way
# through the corpus.
with open('bigram_eval.txt', 'w') as fw, \
        open('toy_corpus.txt', encoding='utf-16') as toy:
    for line in toy:
        words = line.strip().split(" ")
        sent_len = len(words)

        # Sentence probability: product of probs[first][second] over every
        # pair of adjacent tokens on the line.
        sent_prob = 1
        for word1, word2 in zip(words, words[1:]):
            sent_prob *= probs[word_index_dict[word1.strip().lower()]][
                word_index_dict[word2.strip().lower()]]

        # Inverse of the sent_len-th root of the sentence probability;
        # sent_len counts every space-separated token on the line.
        perplexity = 1 / (pow(sent_prob, 1.0 / sent_len))
        fw.write(str(perplexity) + '\n')

# Write ten GENERATE outputs to bigram_generation.txt, one per line.
# Each call is seeded with '<s>'; the numeric argument 10 is passed straight
# through to GENERATE (presumably a length limit -- confirm against GENERATE).
with open('bigram_generation.txt', 'w') as generation_file:
    generation_file.writelines(
        GENERATE(word_index_dict, probs, "bigram", 10, '<s>') + '\n'
        for _ in range(10))
Ejemplo n.º 2
0
# Problem 6: calculating sentence probabilities.
# For every line of toy_corpus.txt, write the sentence probability under
# `probs` and the matching perplexity to smoothed_eval.txt.
# Both files live in one `with` so they are closed even when a word lookup
# raises in the middle of the corpus.
with codecs.open("toy_corpus.txt", "r", "utf-16") as sentence, \
        open("smoothed_eval.txt", "w") as output:
    # m is the 1-based sentence number used in the report lines.
    for m, line in enumerate(sentence, start=1):
        # Remove the start marker as a whole prefix. str.lstrip("<s>") would
        # strip any leading run of the characters '<', 's' and '>' and could
        # therefore eat the first letters of the first real word.
        if line.startswith("<s>"):
            line = line[len("<s>"):]

        tokens = [word.lower() for word in line.split()]

        # Chain the bigram probabilities, starting from the start marker.
        sentprob = 1
        previous_word = "<s>"
        for word in tokens:
            sentprob *= probs[word_index_dict[previous_word],
                              word_index_dict[word]]
            previous_word = word

        # n is the number of remaining tokens plus one (the start marker).
        n = len(tokens) + 1
        perplexity = 1 / pow(sentprob, 1.0 / n)
        output.write("The probability of sentence %d is: %e\nThe perplexity of sentence %d is: %e \n"
                     % (m, sentprob, m, perplexity))

# Write ten GENERATE outputs, each seeded with "<s>", to
# smoothed_generation.txt, one per line. The `with` block closes the file
# even if GENERATE raises, which the manual open()/close() pair did not.
with open("smoothed_generation.txt", "w") as generation:
    for _ in range(10):
        generation.write(GENERATE(word_index_dict, probs, "bigram", 15, "<s>") + "\n")
Ejemplo n.º 3
0
print(str(probs[word_index_dict['anonymous']][word_index_dict['calls']]),
      file=output_file)

output_file.close()

# Problem 6: write the perplexity of every sentence in toy_corpus.txt under
# `probs` to smoothed_eval.txt, one value per line.
# Both files are opened in one `with` so they are closed on any exit path.
with codecs.open("toy_corpus.txt", 'r', encoding="utf-16") as f, \
        open('smoothed_eval.txt', 'w') as output_file:
    for line in f:
        words = line.split()

        # Drop the first token only when it really is the start marker; the
        # chain below already starts from '<s>', so unconditionally deleting
        # words[0] would discard a real word on lines without the marker.
        if words and words[0] == '<s>':
            del words[0]

        # A blank line (or one holding only the marker) has nothing to score.
        # Previously this raised IndexError / ZeroDivisionError; skip it.
        if not words:
            continue

        # Chain the bigram probabilities, starting from the start marker.
        sentprob = 1
        previous_word = '<s>'
        for w in words:
            w = w.lower()
            sentprob *= probs[word_index_dict[previous_word]][word_index_dict[w]]
            previous_word = w

        # The root is taken over the tokens that follow the start marker.
        sent_len = len(words)
        perplexity = 1 / (pow(sentprob, 1.0 / sent_len))
        print(str(perplexity), file=output_file)

# Problem 7: write a single GENERATE output, seeded with '<s>', to
# smoothed_generation.txt. The `with` block closes the file even if
# GENERATE raises, which the manual open()/close() pair did not.
with open('smoothed_generation.txt', 'w') as output_file:
    print(GENERATE(word_index_dict, probs, 'bigram', 10, '<s>'), file=output_file)
Ejemplo n.º 4
0
# For every line read from `sentence`, write the sentence probability under
# `probs` and the matching perplexity to bigram_eval.txt.
# `sentence` is opened before this section (not visible here), so it is
# closed in a `finally`; the output file is owned by the `with` block. Both
# are therefore released even if a word lookup raises mid-corpus.
try:
    with open("bigram_eval.txt", "w") as output:
        # m is the 1-based sentence number used in the report lines.
        for m, line in enumerate(sentence, start=1):
            # Remove the start marker as a whole prefix. str.lstrip("<s>")
            # would strip any leading run of the characters '<', 's' and '>'
            # and could therefore eat the first letters of the first word.
            if line.startswith("<s>"):
                line = line[len("<s>"):]

            tokens = [word.lower() for word in line.split()]

            # Chain the bigram probabilities, starting from the start marker.
            sentprob = 1
            previous_word = "<s>"
            for word in tokens:
                sentprob *= probs[word_index_dict[previous_word],
                                  word_index_dict[word]]
                previous_word = word

            # n is the number of remaining tokens plus one (the start marker).
            n = len(tokens) + 1
            perplexity = 1 / pow(sentprob, 1.0 / n)
            output.write(
                "The probability of sentence %d is: %e\nThe perplexity of sentence %d is: %e \n"
                % (m, sentprob, m, perplexity))
finally:
    sentence.close()

# Write ten GENERATE outputs, each seeded with "<s>", to
# bigram_generation.txt, one per line. The `with` block closes the file even
# if GENERATE raises, which the manual open()/close() pair did not.
with open("bigram_generation.txt", "w") as generation:
    for _ in range(10):
        generation.write(
            GENERATE(word_index_dict, probs, "bigram", 15, "<s>") + "\n")