Ejemplo n.º 1
0
def _frequency_of_frequencies(ngram_counts):
    """Return ``{r: number of n-grams whose count equals r}`` as a plain dict.

    A plain ``dict`` is returned (rather than ``collections.Counter``) so that
    looking up a count nobody has still raises ``KeyError``, exactly like the
    hand-built dictionaries this helper replaces.
    """
    bins = {}
    # The tally does not depend on iteration order, so no sorting is needed.
    for count in ngram_counts.values():
        bins[count] = bins.get(count, 0) + 1
    return bins


def _sentence_probability(tokens, bins, ngram_counts, ngram_len):
    """Return the product of ``calculate_prob`` over ``tokens`` (1.0 if empty)."""
    probability = 1.0
    for token in tokens:
        probability *= float(calculate_prob(token, bins, ngram_counts, ngram_len))
    return probability


def good_turing_interpolation():
    """Print an interpolated n-gram probability for each line of ``Hindi.txt``.

    For every line, three scores are computed -- one each for the line's
    trigrams, bigrams and unigrams -- as the product of ``calculate_prob``
    over the n-grams produced by the matching ``*_generator`` function.  The
    scores are then mixed with the fixed weights 0.5 (trigram), 0.3 (bigram)
    and 0.2 (unigram).

    Relies on names defined elsewhere in the module: ``trigram_dict``,
    ``bigram_dict``, ``unigram_dict``, ``trigram_len``, ``bigram_len``,
    ``unigram_len``, ``trigram_generator``, ``bigram_generator``,
    ``unigram_generator`` and ``calculate_prob``.

    Returns:
        None.  Output goes to stdout: a ``Probabilities`` header followed by
        one ``line <i> : <probability>`` row per input line (1-based).

    Raises:
        IOError: if ``Hindi.txt`` cannot be opened (``OSError`` on Python 3).
    """
    bins_tri = _frequency_of_frequencies(trigram_dict)
    bins_bi = _frequency_of_frequencies(bigram_dict)
    bins_uni = _frequency_of_frequencies(unigram_dict)

    # ``with`` guarantees the file is closed even if scoring a line raises.
    with open("Hindi.txt", "r") as corpus:
        print("Probabilities")
        # Iterate the file lazily instead of loading every line up front.
        for line_no, line in enumerate(corpus, 1):
            # Each generator returns (token list, token dict); only the list
            # is needed here.
            tokens, _unused = trigram_generator(line)
            tri_prob = _sentence_probability(tokens, bins_tri, trigram_dict, trigram_len)

            tokens, _unused = bigram_generator(line)
            bi_prob = _sentence_probability(tokens, bins_bi, bigram_dict, bigram_len)

            tokens, _unused = unigram_generator(line)
            uni_prob = _sentence_probability(tokens, bins_uni, unigram_dict, unigram_len)

            # NOTE(review): a raw product of probabilities may underflow to
            # 0.0 on long lines; kept as-is to preserve the printed values.
            final_prob = 0.5 * tri_prob + 0.3 * bi_prob + 0.2 * uni_prob

            # A single pre-formatted string prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("line %s : %s" % (line_no, final_prob))
Ejemplo n.º 2
0
def _count_bins(ngram_dict):
    """Return a plain dict mapping each count value to how many n-grams have it.

    The result is an ordinary ``dict`` so a lookup of an absent count still
    raises ``KeyError``, as with the hand-built dictionaries it replaces.
    """
    bins = {}
    # Counting is order-independent; the n-grams do not need to be sorted.
    for count in ngram_dict.values():
        bins[count] = bins.get(count, 0) + 1
    return bins


def _good_turing_product(tokens, bins, ngram_len, ngram_dict):
    """Return the product of ``calculate_good_next`` over ``tokens``.

    Yields 1.0 when ``tokens`` is empty.
    """
    prob = 1.0
    for token in tokens:
        prob *= float(calculate_good_next(bins, token, ngram_len, ngram_dict))
    return prob


def good_turing_interpolation():
    """Print one interpolated n-gram probability per line of ``Hindi.txt``.

    For every line, the trigram, bigram and unigram scores are each the
    product of ``calculate_good_next`` over the tokens returned by
    ``add_tri``, ``add_bi`` and ``add_uni``.  They are combined with the
    fixed weights 0.5, 0.3 and 0.2 respectively, and the sum is printed.

    Relies on names defined elsewhere in the module: ``trigram_dict``,
    ``bigram_dict``, ``unigram_dict``, ``trigram_len``, ``bigram_len``,
    ``unigram_len``, ``add_tri``, ``add_bi``, ``add_uni`` and
    ``calculate_good_next``.

    Returns:
        None.  One probability per input line is written to stdout.

    Raises:
        IOError: if ``Hindi.txt`` cannot be opened (``OSError`` on Python 3).
    """
    bins_tri = _count_bins(trigram_dict)
    bins_bi = _count_bins(bigram_dict)
    bins_uni = _count_bins(unigram_dict)

    # ``with`` closes the file on every exit path, including exceptions.
    with open('Hindi.txt', 'r') as corpus:
        # Stream the file line by line rather than reading it all at once.
        for line in corpus:
            # Each add_* helper returns (token list, token dict); only the
            # token list is used.
            tokens, _unused = add_tri(line)
            tri_prob = _good_turing_product(tokens, bins_tri, trigram_len, trigram_dict)

            tokens, _unused = add_bi(line)
            bi_prob = _good_turing_product(tokens, bins_bi, bigram_len, bigram_dict)

            tokens, _unused = add_uni(line)
            uni_prob = _good_turing_product(tokens, bins_uni, unigram_len, unigram_dict)

            # NOTE(review): multiplying raw probabilities may underflow to
            # 0.0 on long lines; left unchanged to keep the output identical.
            prob_add = 0.5 * tri_prob + 0.3 * bi_prob + 0.2 * uni_prob

            # Parenthesised single value prints the same on Python 2 and 3.
            print(prob_add)