def smooth(tagged_list, prime=0):
    """Dispatch entropy computation over a tagged corpus.

    prime == 0 : just return the corpus with tags stripped.
    prime == 1 : use the prime_* bigram/trigram smoothing helpers.
    otherwise  : use the xy_prime_* bigram/trigram smoothing helpers.

    Returns either the untagged word list (prime == 0) or whatever
    entropy() produces for the smoothed count tables.
    """
    brown_words = remove_tags(tagged_list)
    if prime == 0:
        return brown_words

    # Shared setup for both smoothing variants.
    stripped = remove_words(tagged_list)
    unigrams, _, _ = ax.countNgrams(stripped, 0)
    tag_lookup = word_tag(tagged_list)
    _, word_bigrams, word_trigrams = ax.countNgrams(brown_words, 0)

    # The two variants differ only in which smoothing helpers they apply.
    if prime == 1:
        bi = prime_bigram(word_bigrams, tag_lookup)
        tri = prime_trigram(word_trigrams, tag_lookup)
    else:
        bi = xy_prime_bigram(word_bigrams, tag_lookup)
        tri = xy_prime_trigram(word_trigrams, tag_lookup)

    return entropy([unigrams, bi, tri], 1)
def get_trigram_entropy(words, start, end):
    """Return the trigram-model entropy of ``words`` between start and end.

    NOTE(review): ``end`` doubles as the total word count fed to
    unigram_prob — confirm callers always pass end == number of tokens.
    """
    unigrams, bigrams, trigrams = auxiliar.countNgrams(words, start, end)
    p_uni = helper.unigram_prob(unigrams, float(end))
    p_bi, bi_contexts = helper.bigram_prob(bigrams, unigrams)
    p_tri, tri_contexts = helper.trigram_prob(trigrams, bigrams)
    return helper.trigram_model(unigrams, bi_contexts, tri_contexts,
                                p_uni, p_bi, p_tri)
def compute_models():
    """Build the unigram, bigram and trigram entropy models for the
    English corpus at corpora/en.txt.

    Returns a (u_h, b_h, t_h) triple of model entropies as produced by
    helper.unigram_model / bigram_model / trigram_model.
    """
    corpus_path = 'corpora/en.txt'
    corpus = auxiliar.getWordsFromFile(corpus_path)
    n_tokens = float(len(corpus))

    unigrams, bigrams, trigrams = auxiliar.countNgrams(corpus, 0, 0)

    # Probability tables; the *_prob helpers also report the context
    # possibilities needed by the higher-order models.
    p_uni = helper.unigram_prob(unigrams, n_tokens)
    p_bi, bi_contexts = helper.bigram_prob(bigrams, unigrams)
    p_tri, tri_contexts = helper.trigram_prob(trigrams, bigrams)

    h_uni = helper.unigram_model(unigrams, p_uni)
    h_bi = helper.bigram_model(unigrams, bi_contexts, p_uni, p_bi)
    h_tri = helper.trigram_model(unigrams, bi_contexts, tri_contexts,
                                 p_uni, p_bi, p_tri)
    return h_uni, h_bi, h_tri
def entropy(word_list, flag=0):
    """Return (unigram, bigram, trigram) entropies.

    flag == 0 : ``word_list`` is a token sequence — count n-grams here.
    otherwise : ``word_list`` is already a (U, B, T) triple of count tables.
    """
    if flag == 0:
        counts = ax.countNgrams(word_list, 0)
    else:
        counts = word_list
    unigrams, bigrams, trigrams = counts

    total = freq_count(unigrams)
    p_x = prob_x(unigrams, total)
    h_uni = uni_entropy(p_x)

    p_yx = prob_yx(bigrams, unigrams)
    h_bi = bi_entropy(p_yx, p_x)

    p_zxy = prob_zxy(trigrams, bigrams)
    h_tri = tri_entropy(p_zxy, p_yx, p_x)

    return (h_uni, h_bi, h_tri)
def get_brown_tri(tagged_data):
    """Strip tags: return just the words from a list of (word, tag) pairs."""
    return [word for word, _tag in tagged_data]


def get_brown_tags_uni(tagged_data):
    """Return only the tags from the first element (sentence) of tagged_data."""
    return [tag for _word, tag in tagged_data[0]]


# Get Data: n-gram counts for the English corpus, then per-order
# probabilities and entropies.
n_grams_en = ax.countNgrams(enWords, 0)

uni_gram_dict = filter_data(n_grams_en, 1)
freq_uni = freq(uni_gram_dict)
uni_prob = prob_x(uni_gram_dict, freq_uni)
H_uni = uni_entropy(uni_prob)

bi_gram_dict = filter_data(n_grams_en, 2)
bi_prob = prob_yx(bi_gram_dict, uni_gram_dict)
H_bi = bi_entropy(bi_prob, uni_prob)

tri_gram_dict = filter_data(n_grams_en, 3)
tri_prob = prob_zxy(tri_gram_dict, bi_gram_dict)
H_tri = tri_entropy(tri_prob, bi_prob, uni_prob)
# --- Tail of countNgrams (its `def` lies above this excerpt): for position i
# of token list `l`, bump the bigram (B) and trigram (T) count tables.
# `xIsTag` / `yIsTag` select a component of each (word, tag) pair —
# presumably 0 or 1; confirm against the full definition.
        if (l[i - 1][xIsTag], l[i][yIsTag]) not in B:
            B[(l[i - 1][xIsTag], l[i][yIsTag])] = 1
        else:
            B[(l[i - 1][xIsTag], l[i][yIsTag])] += 1
        # NOTE(review): the trigram's last component uses l[i][0] where the
        # bigram uses l[i][yIsTag] — confirm this asymmetry is intentional.
        if (l[i - 2][xIsTag], l[i - 1][yIsTag], l[i][0]) not in T:
            T[(l[i - 2][xIsTag], l[i - 1][yIsTag], l[i][0])] = 1
        else:
            T[(l[i - 2][xIsTag], l[i - 1][yIsTag], l[i][0])] += 1
    # Unigram, bigram and trigram count tables.
    return (U, B, T)


#------------------------------------MAIN-------------------------------
# Count n-grams over en.txt, then print entropy and perplexity
# (2 ** entropy) for each of the three models.
words = getWordsFromFile('en.txt')
(unicount, bicount, tricount) = countNgrams(words, 0)
print("unigram")
print(unigramEntropy(unicount, bicount, tricount, len(words)))
print("perplexity unigram")
print(math.pow(2, unigramEntropy(unicount, bicount, tricount, len(words))))
print("bigram")
print(bigramEntropy(unicount, bicount, tricount, len(words)))
print("perplexity bigram")
print(math.pow(2, bigramEntropy(unicount, bicount, tricount, len(words))))
print("trigram")
print(trigramEntropy(unicount, bicount, tricount, len(words)))
print("perplexity trigram")
print(math.pow(2, trigramEntropy(unicount, bicount, tricount, len(words))))
print("(x,y,z), words, full")