Esempio n. 1
0
def good_turing_interpolation():
	bins_tri = {}
	bins_bi = {}
	bins_uni = {}
	for key, value in sorted(trigram_dict.iteritems()):
		if value in bins_tri:
			bins_tri[value] += 1
		else:
			bins_tri[value] = 1
	for key, value in sorted(bigram_dict.iteritems()):
		if value in bins_bi:
			bins_bi[value] += 1
		else:
			bins_bi[value] = 1
	for key, value in sorted(unigram_dict.iteritems()):
		if value in bins_uni:
			bins_uni[value] += 1
		else:
			bins_uni[value] = 1

	f = open('English.txt', 'r')
	i = 1
	print 'Probabilities'
	for line in f.readlines():
		
		final_prob = float(0)

		probability = float(1)
		lis_tokens, dic_tokens = trigram_generator(line)
		for token in lis_tokens:
			probability = probability*float(calculate_prob(token, bins_tri, trigram_dict, trigram_len)) 
		
		final_prob += 0.5*probability

		probability = float(1)
		lis_tokens, dic_tokens = bigram_generator(line)
		for token in lis_tokens:
			probability = probability*float(calculate_prob(token, bins_bi, bigram_dict, bigram_len)) 
		
		final_prob += 0.3*probability

		probability = float(1)
		lis_tokens, dic_tokens = unigram_generator(line)
		for token in lis_tokens:
			probability = probability*float(calculate_prob(token, bins_uni, unigram_dict, unigram_len)) 
				
		final_prob += 0.2*probability

		print 'line', i, ':', final_prob
		i = i+1
Esempio n. 2
0
def good_turing_smoothing():
    bins_tri = {}
    for key, value in sorted(trigram_dict.iteritems()):
        if value in bins_tri:
            bins_tri[value] += 1
        else:
            bins_tri[value] = 1
            # print bins_tri
    op = open("English.txt", "r")
    for line in op.readlines():
        prob = 1
        [new_token, tokendictionary1] = add_tri(line)
        for tokens_tri in new_token:
            prob = prob * float(calculate_good_next(bins_tri, tokens_tri, trigram_len, trigram_dict))
        print prob
Esempio n. 3
0
def good_turing():
	bins = {}
	for key, value in sorted(trigram_dict.iteritems()):
		if value in bins:
			bins[value] += 1
		else:
			bins[value] = 1
	f = open('English.txt', 'r')
	i = 1
	print 'Probabilities'
	for line in f.readlines():
		probability = float(1)
		lis_tokens, dic_tokens = trigram_generator(line)
		for token in lis_tokens:
			probability = probability*float(calculate_prob(token, bins, trigram_dict, trigram_len)) 
		print 'line', i, ':', probability
		i = i+1
Esempio n. 4
0
def good_turing_interpolation():
    bins_tri = {}
    bins_bi = {}
    bins_uni = {}
    for key, value in sorted(trigram_dict.iteritems()):
        if value in bins_tri:
            bins_tri[value] += 1
        else:
            bins_tri[value] = 1
            # print bins_tri
    for key, value in sorted(bigram_dict.iteritems()):
        if value in bins_bi:
            bins_bi[value] += 1
        else:
            bins_bi[value] = 1

    for key, value in sorted(unigram_dict.iteritems()):
        if value in bins_uni:
            bins_uni[value] += 1
        else:
            bins_uni[value] = 1

    op = open("English.txt", "r")
    for line in op.readlines():
        prob = 1
        prob_add = 0
        [new_token, tokendictionary1] = add_tri(line)
        for tokens_tri in new_token:
            prob = prob * float(calculate_good_next(bins_tri, tokens_tri, trigram_len, trigram_dict))
            # print prob
        prob_add = 0.5 * prob
        prob = 1
        [new_token, tokendictionary1] = add_bi(line)
        for tokens_bi in new_token:
            prob = prob * float(calculate_good_next(bins_bi, tokens_bi, bigram_len, bigram_dict))
            # print prob
        prob_add += 0.3 * prob
        prob = 1
        [new_token, tokendictionary1] = add_uni(line)
        for tokens_uni in new_token:
            prob = prob * float(calculate_good_next(bins_uni, tokens_uni, unigram_len, unigram_dict))
        prob_add += 0.2 * prob
        print prob_add