Ejemplo n.º 1
0
def cross_validate(j_lines,e_lines,index,ngram_order=1,verbose=False):
    """Run one cross-validation fold and print/return its scores.

    The data is split with ``split_data(j_lines, e_lines, index)``, a model
    is built with ``construct_hmm`` from the n-grammed training part, and
    every held-out English entry is scored with ``test_one``.  Correct
    guesses are written to ``out_right_<index>.txt`` and wrong guesses
    (together with the guess) to ``out_wrong_<index>.txt``.

    Args:
        j_lines: Japanese-side entries, handed to ``split_data`` unchanged.
        e_lines: English-side entries, handed to ``split_data`` unchanged.
        index: Fold identifier; forwarded to ``split_data`` and embedded in
            the two output file names.
        ngram_order: N-gram order used for the training data and forwarded
            to ``test_one``.
        verbose: Forwarded to ``test_one``.

    Returns:
        A tuple ``(avg_score, precision_word_wise)``.  Both values are 0.0
        when the test split is empty.
    """
    print("[Start] Cross Validating Data")

    correct_count = 0
    total_score = 0.0
    buckets = [0] * 10

    print("[.....] Creating training and testing data sets")
    train_j, train_e, test_j, test_e = split_data(j_lines, e_lines, index)
    total_test = len(test_e)

    print("[Start] Converting training set to %i-Grams" % ngram_order)
    n_gramed_japanese = [ngramer.ngram(x, ngram_order) for x in train_j]
    n_gramed_english = [ngramer.ngram(x, ngram_order) for x in train_e]
    print("[ End ] Converting training set to %i-Grams" % ngram_order)

    model = construct_hmm(n_gramed_japanese, n_gramed_english)

    # Testing phase: map each space-joined English entry to its
    # space-joined Japanese counterpart so the truth can be looked up.
    print("[.....] Preparing test set")
    ground_truth = {}
    for eng, jap in zip(test_e, test_j):
        ground_truth[' '.join(eng)] = ' '.join(jap)

    print("[.....] Testing")
    # The original format strings had no placeholder for ``index``, which
    # raised TypeError; the fold index now becomes part of the file name.
    # ``with`` guarantees both files are closed even if test_one raises.
    with open("out_right_%s.txt" % index, "w") as rightfile:
        with open("out_wrong_%s.txt" % index, "w") as wrongfile:
            for eng in test_e:
                e = ' '.join(eng)
                truth = ground_truth[e]
                score, correct, guessed = test_one(
                    model, e, truth,
                    ngram_order=ngram_order, verbose=verbose)
                if correct:
                    correct_count += 1
                    rightfile.write("%s,%s\n" % (e, truth))
                else:
                    wrongfile.write("%s,%s,%s\n" % (e, truth, guessed))
                total_score += score
                # Ten buckets of width 0.1; clamp so a score of exactly 1.0
                # (or anything out of range) cannot index outside the list
                # or wrap around via a negative index.
                bucket = min(max(int(score * 10), 0), 9)
                buckets[bucket] += 1

    if total_test:
        avg_score = total_score / total_test
        precision_word_wise = float(correct_count) / float(total_test)
    else:
        # Nothing was tested; avoid dividing by zero.
        avg_score = 0.0
        precision_word_wise = 0.0

    print("[.....] Average Score: %f" % avg_score)
    print("[.....] Wordwise Precision: %f" % precision_word_wise)
    print("[.....] Buckets:")
    for i in range(10):
        print("[.....] %i to %i Percent: %i" % ((i * 10), ((i + 1) * 10), buckets[i]))

    print("[ End ] Cross Validating Data")
    return avg_score, precision_word_wise
Ejemplo n.º 2
0
def guess(model,teststring,ngram_order=1,verbose=False):
    """Label a single test string with the model and return the guess.

    ``teststring`` is split on whitespace, e.g. "AA R B AX T ER" becomes
    ['AA', 'R', 'B', 'AX', 'T', 'ER'], then converted to n-grams of the
    given order before tagging.  The tagged result is converted back with
    ``ngramer.unngram`` and its pieces are concatenated into one string.
    When ``verbose`` is true the intermediate stages are printed.
    """
    tokens = ngramer.ngram(teststring.split(), ngram_order)
    if verbose:
        print("Converted to %i-Gram: %s " % (ngram_order, tokens))

    # Keep element 1 of every item returned by the tagger (the possibly
    # n-grammed label).
    labels = [pair[1] for pair in model.tag(tokens)]
    if verbose:
        print("Recovered: %s " % (labels))

    # Undo the n-gram grouping.
    recovered = ngramer.unngram(labels)
    if verbose:
        print("Fixed to: %s " % (recovered))

    # Concatenate the recovered pieces into the final guess.
    return "".join(recovered)
Ejemplo n.º 3
0
def smalltest(dictionary_file, ngram_order=1, verbose=False):
    """Build a model from a dictionary file and run one fixed test case.

    The dictionary is read with ``read_dictionary``, aligned with
    ``align_phoneme_sets``, converted to n-grams and fed to
    ``construct_hmm``.  The resulting model is then checked with
    ``test_one`` on the hard-coded pair "AA M AX N D AX" / "A - MO n DO".

    Args:
        dictionary_file: Passed to ``read_dictionary``.
        ngram_order: N-gram order for training and for ``test_one``.
        verbose: Forwarded to ``test_one``.
    """
    import ngramer

    japanese, english = read_dictionary(dictionary_file)
    aligned_japanese, aligned_english = align_phoneme_sets(japanese, english)

    print("[Start] Converting dictionary to %i-Grams" % ngram_order)
    n_gramed_japanese = [ngramer.ngram(x, ngram_order) for x in aligned_japanese]
    n_gramed_english = [ngramer.ngram(x, ngram_order) for x in aligned_english]
    print("[ End ] Converting dictionary to %i-Grams" % ngram_order)

    model = construct_hmm(n_gramed_japanese, n_gramed_english)

    # Options are passed by keyword, matching the test_one call in
    # cross_validate, so they cannot land on the wrong parameter.
    test_one(model, "AA M AX N D AX", "A - MO n DO",
             ngram_order=ngram_order, verbose=verbose)
Ejemplo n.º 4
0
def newwordtest(dictionary_file, wordfile, ngram_order=1, verbose=False):
    """Train on a dictionary and print a guess for every word in a file.

    The model is built the same way as in the other test helpers
    (``read_dictionary`` -> ``align_phoneme_sets`` -> n-grams ->
    ``construct_hmm``).  Each line of ``wordfile`` is split on commas; the
    first field is used as the English word and the second as the sound
    string handed to ``guess``.  One "english -> sound -> guess" line is
    printed per entry.

    Args:
        dictionary_file: Passed to ``read_dictionary``.
        wordfile: Path of the comma-separated word list to label.
        ngram_order: N-gram order for training and guessing.
        verbose: Forwarded to ``guess``.
    """
    import ngramer

    japanese, english = read_dictionary(dictionary_file)
    aligned_japanese, aligned_english = align_phoneme_sets(japanese, english)

    print("[Start] Converting dictionary to %i-Grams" % ngram_order)
    n_gramed_japanese = [ngramer.ngram(x, ngram_order) for x in aligned_japanese]
    n_gramed_english = [ngramer.ngram(x, ngram_order) for x in aligned_english]
    print("[ End ] Converting dictionary to %i-Grams" % ngram_order)

    model = construct_hmm(n_gramed_japanese, n_gramed_english)

    # Read everything up front and close the file right away.
    with open(wordfile) as testfile:
        testlines = testfile.readlines()

    for line in testlines:
        # Drop the trailing newline so it does not end up inside the
        # printed sound field.
        stripped = line.strip()
        if not stripped:
            continue  # blank lines carry no entry
        fields = stripped.split(",")
        if len(fields) < 2:
            # No sound column; report it instead of failing with IndexError.
            print("[.....] Skipping malformed line: %r" % stripped)
            continue
        word = fields[0]
        sound = fields[1]
        kguess = guess(model, sound, ngram_order, verbose)
        print("%s -> %s -> %s" % (word, sound, kguess))