Exemple #1
0
def smooth(tagged_list, prime=0):
    """Return the tag-stripped corpus or the entropies of a combined model.

    Args:
        tagged_list: Tagged corpus; handed unchanged to ``remove_tags``,
            ``remove_words`` and ``word_tag``.
        prime: Mode selector. ``0`` returns the output of ``remove_tags``
            directly. ``1`` derives the bigram/trigram tables with
            ``prime_bigram`` / ``prime_trigram``; any other value uses
            ``xy_prime_bigram`` / ``xy_prime_trigram`` instead.

    Returns:
        ``remove_tags(tagged_list)`` when ``prime == 0``; otherwise the
        result of ``entropy([unigrams, bigrams, trigrams], 1)``.
    """
    brown_words = remove_tags(tagged_list)
    if prime == 0:
        return brown_words

    # Unigram counts are taken from the remove_words() view of the corpus;
    # only the first element of the countNgrams() triple is needed here.
    unigrams, _, _ = ax.countNgrams(remove_words(tagged_list), 0)
    word_tag_dict = word_tag(tagged_list)

    # Bigram/trigram counts are taken from the remove_tags() view and then
    # rewritten with the word/tag mapping by the mode-specific builders.
    _, raw_bigrams, raw_trigrams = ax.countNgrams(brown_words, 0)
    if prime == 1:
        bigrams = prime_bigram(raw_bigrams, word_tag_dict)
        trigrams = prime_trigram(raw_trigrams, word_tag_dict)
    else:
        bigrams = xy_prime_bigram(raw_bigrams, word_tag_dict)
        trigrams = xy_prime_trigram(raw_trigrams, word_tag_dict)

    # flag=1 tells entropy() that it receives ready-made count tables.
    return entropy([unigrams, bigrams, trigrams], 1)
def get_trigram_entropy(words, start, end):
    """Return the ``helper.trigram_model`` value for a slice of ``words``.

    Args:
        words: Token sequence forwarded to ``auxiliar.countNgrams``.
        start: Start bound forwarded to ``auxiliar.countNgrams``.
        end: End bound forwarded to ``auxiliar.countNgrams``; also used as
            the total word count for the unigram probabilities.

    Returns:
        Whatever ``helper.trigram_model`` returns for the computed tables.
    """
    unigrams, bigrams, trigrams = auxiliar.countNgrams(words, start, end)

    # NOTE(review): the unigram normaliser is ``end`` rather than
    # ``end - start`` -- confirm this is intended when start != 0.
    unigram_p = helper.unigram_prob(unigrams, float(end))
    bigram_p, bigram_options = helper.bigram_prob(bigrams, unigrams)
    trigram_p, trigram_options = helper.trigram_prob(trigrams, bigrams)

    return helper.trigram_model(
        unigrams,
        bigram_options,
        trigram_options,
        unigram_p,
        bigram_p,
        trigram_p,
    )
def compute_models(path='corpora/en.txt'):
    """Compute the unigram, bigram and trigram model values for a corpus.

    Args:
        path: Corpus file handed to ``auxiliar.getWordsFromFile``. Defaults
            to ``'corpora/en.txt'``, the location that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        A ``(u_h, b_h, t_h)`` tuple holding the results of
        ``helper.unigram_model``, ``helper.bigram_model`` and
        ``helper.trigram_model`` respectively.
    """
    words = auxiliar.getWordsFromFile(path)
    total_words = float(len(words))

    # Bounds (0, 0) are passed through as in the original call; presumably
    # this selects the whole word list -- verify against countNgrams.
    U, B, T = auxiliar.countNgrams(words, 0, 0)

    u_prob = helper.unigram_prob(U, total_words)
    b_prob, b_posibilities = helper.bigram_prob(B, U)
    t_prob, t_posibilities = helper.trigram_prob(T, B)

    u_h = helper.unigram_model(U, u_prob)
    b_h = helper.bigram_model(U, b_posibilities, u_prob, b_prob)
    t_h = helper.trigram_model(
        U, b_posibilities, t_posibilities, u_prob, b_prob, t_prob
    )

    return u_h, b_h, t_h
Exemple #4
0
def entropy(word_list, flag=0):
    """Return the unigram, bigram and trigram entropies as a 3-tuple.

    Args:
        word_list: With ``flag == 0``, the input handed to
            ``ax.countNgrams``. With any other flag, an iterable that
            unpacks directly into the three count tables ``(U, B, T)``.
        flag: ``0`` to count n-grams here; anything else to treat
            ``word_list`` as pre-computed counts.

    Returns:
        ``(h_uni, h_bi, h_tri)`` as produced by ``uni_entropy``,
        ``bi_entropy`` and ``tri_entropy``.
    """
    counts = ax.countNgrams(word_list, 0) if flag == 0 else word_list
    unigrams, bigrams, trigrams = counts

    # Each order builds on the probabilities of the orders below it.
    p_uni = prob_x(unigrams, freq_count(unigrams))
    h_uni = uni_entropy(p_uni)

    p_bi = prob_yx(bigrams, unigrams)
    h_bi = bi_entropy(p_bi, p_uni)

    p_tri = prob_zxy(trigrams, bigrams)
    h_tri = tri_entropy(p_tri, p_bi, p_uni)

    return (h_uni, h_bi, h_tri)
Exemple #5
0
def get_brown_tri(tagged_data):
    """Return the first item of every ``(word, tag)`` pair, in order.

    Args:
        tagged_data: Iterable of two-element ``(word, tag)`` pairs.

    Returns:
        A new list containing only the words.
    """
    return [word for word, _tag in tagged_data]

def get_brown_tags_uni(tagged_data):
    """Return the tags of the ``(word, tag)`` pairs in ``tagged_data[0]``.

    Only the first element of ``tagged_data`` is read.
    NOTE(review): presumably ``tagged_data`` is a sequence whose first entry
    is the tagged token list -- confirm against the caller.

    Args:
        tagged_data: Indexable container whose element ``0`` is an iterable
            of two-element ``(word, tag)`` pairs.

    Returns:
        A new list containing the second item of each pair, in order.
    """
    return [tag for _word, tag in tagged_data[0]]


# Get data: count the n-grams of enWords once, then derive one entropy value
# per model order (H_uni, H_bi, H_tri). Every name below is module-level.
n_grams_en = ax.countNgrams(enWords, 0)
# filter_data(n_grams_en, k) presumably selects the k-gram table
# (1 = unigrams, 2 = bigrams, 3 = trigrams) -- TODO confirm.
uni_gram_dict = filter_data(n_grams_en, 1)

# Unigram model: probabilities from the unigram table and freq(), then entropy.
freq_uni = freq(uni_gram_dict)
uni_prob = prob_x(uni_gram_dict, freq_uni)
H_uni = uni_entropy(uni_prob)


# Bigram model: prob_yx() takes the bigram table together with the unigram
# table; the entropy additionally needs the unigram probabilities.
bi_gram_dict = filter_data(n_grams_en, 2)
bi_prob = prob_yx(bi_gram_dict, uni_gram_dict)
H_bi =  bi_entropy(bi_prob, uni_prob)

# Trigram model: prob_zxy() takes the trigram table together with the bigram
# table; the entropy needs the probabilities of all three orders.
tri_gram_dict = filter_data(n_grams_en, 3)
tri_prob = prob_zxy(tri_gram_dict, bi_gram_dict)
H_tri = tri_entropy(tri_prob, bi_prob, uni_prob)
Exemple #6
0
        if (l[i - 1][xIsTag], l[i][yIsTag]) not in B:
            B[(l[i - 1][xIsTag], l[i][yIsTag])] = 1
        else:
            B[(l[i - 1][xIsTag], l[i][yIsTag])] += 1
        if (l[i - 2][xIsTag], l[i - 1][yIsTag], l[i][0]) not in T:
            T[(l[i - 2][xIsTag], l[i - 1][yIsTag], l[i][0])] = 1
        else:
            T[(l[i - 2][xIsTag], l[i - 1][yIsTag], l[i][0])] += 1
    return (U, B, T)


#------------------------------------MAIN-------------------------------

# Load the corpus and count its n-grams once; every report below reuses them.
words = getWordsFromFile('en.txt')
word_total = len(words)

(unicount, bicount, tricount) = countNgrams(words, 0)

# Evaluate each entropy a single time and derive the perplexity (2 ** H) from
# that value, rather than re-running the entropy function for the perplexity
# line. The printed lines and their order are unchanged.
for model_name, entropy_fn in (
    ("unigram", unigramEntropy),
    ("bigram", bigramEntropy),
    ("trigram", trigramEntropy),
):
    model_entropy = entropy_fn(unicount, bicount, tricount, word_total)
    print(model_name)
    print(model_entropy)
    print("perplexity " + model_name)
    print(math.pow(2, model_entropy))

print("(x,y,z), words, full")