def cross_validate(j_lines, e_lines, index, ngram_order=1, verbose=False):
    """Train on one split of the data and score the model on the held-out part.

    The data is divided by ``split_data(j_lines, e_lines, index)`` into
    training and test sets.  The training sets are converted with
    ``ngramer.ngram`` and handed to ``construct_hmm``; every test entry is
    then evaluated with ``test_one``.

    Side effects:
        * Progress and result lines are printed to stdout.
        * ``out_right_<index>.txt`` receives one ``english,japanese`` line
          per test entry that ``test_one`` reported as correct.
        * ``out_wrong_<index>.txt`` receives one ``english,japanese,guess``
          line per test entry reported as incorrect.

    Args:
        j_lines: Japanese-side data, passed through to ``split_data``.
        e_lines: English-side data, passed through to ``split_data``.
        index: Passed to ``split_data`` to select the split; also embedded
            in the names of the two output files.
        ngram_order: N-gram order used for training and testing.
        verbose: Forwarded to ``test_one``.

    Returns:
        A tuple ``(avg_score, precision_word_wise)``: the mean of the scores
        returned by ``test_one`` and the fraction of test entries it marked
        correct.  Both are 0.0 when the test set is empty.
    """
    # NOTE: single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("[Start] Cross Validating Data")
    get_right = 0
    total_score = 0.0
    buckets = [0] * 10  # histogram of scores in steps of 0.1

    print("[.....] Creating training and testing data sets")
    train_j, train_e, test_j, test_e = split_data(j_lines, e_lines, index)
    total_test = len(test_e)

    print("[Start] Converting training set to %i-Grams" % ngram_order)
    n_gramed_japanese = [ngramer.ngram(x, ngram_order) for x in train_j]
    n_gramed_english = [ngramer.ngram(x, ngram_order) for x in train_e]
    print("[ End ] Converting training set to %i-Grams" % ngram_order)

    model = construct_hmm(n_gramed_japanese, n_gramed_english)

    # Testing phase: first map each space-joined English entry to its
    # space-joined Japanese counterpart.  A repeated English entry keeps the
    # last Japanese value seen.
    print("[.....] Preparing test set")
    ground_truth = {}
    for eng_tokens, jap_tokens in zip(test_e, test_j):
        ground_truth[' '.join(eng_tokens)] = ' '.join(jap_tokens)

    print("[.....] Testing")
    # The original format strings had no placeholder, so ``% index`` raised
    # TypeError; the index is now part of the file name.  ``with`` guarantees
    # both files are closed even if testing raises.
    with open("out_right_%s.txt" % index, "w") as rightfile:
        with open("out_wrong_%s.txt" % index, "w") as wrongfile:
            for eng in test_e:
                e = ' '.join(eng)
                truth = ground_truth[e]
                score, correct, guessed = test_one(
                    model, e, truth, ngram_order=ngram_order, verbose=verbose)
                if correct:
                    get_right += 1
                    rightfile.write("%s,%s\n" % (e, truth))
                else:
                    wrongfile.write("%s,%s,%s\n" % (e, truth, guessed))
                total_score += score
                # A score of exactly 1.0 would index past the last bucket.
                bucket = min(int(score * 10), 9)
                buckets[bucket] += 1

    if total_test:
        avg_score = total_score / total_test
        precision_word_wise = float(get_right) / float(total_test)
    else:
        # Nothing was tested; avoid dividing by zero.
        avg_score = 0.0
        precision_word_wise = 0.0

    print("[.....] Average Score: %f" % avg_score)
    print("[.....] Wordwise Precision: %f" % precision_word_wise)
    print("[.....] Buckets:")
    for i in range(10):
        print("[.....] %i to %i Percent: %i" % ((i * 10), ((i + 1) * 10), buckets[i]))
    print("[ End ] Cross Validating Data")
    return avg_score, precision_word_wise
def guess(model, teststring, ngram_order=1, verbose=False):
    """Label a single test string with *model* and return the guessed string.

    ``teststring`` is split on whitespace, converted with ``ngramer.ngram``,
    and passed to ``model.tag``.  The second element of every item that
    ``model.tag`` yields is collected, run through ``ngramer.unngram``, and
    the resulting pieces are concatenated without a separator.

    Args:
        model: Object providing a ``tag`` method.
        teststring: Whitespace-separated symbols, e.g. ``"AA R B AX T ER"``.
        ngram_order: N-gram order handed to ``ngramer.ngram``.
        verbose: When true, print each intermediate representation.

    Returns:
        The concatenated guess as a single string.
    """
    # e.g. "AA R B AX T ER" -> ['AA', 'R', 'B', 'AX', 'T', 'ER']
    tokens = teststring.split()
    grams = ngramer.ngram(tokens, ngram_order)
    if verbose:
        print("Converted to %i-Gram: %s " % (ngram_order, grams))

    # Keep only the (possibly n-grammed) second element of each tagged item.
    tagged_grams = [pair[1] for pair in model.tag(grams)]
    if verbose:
        print("Recovered: %s " % (tagged_grams))

    # Undo the n-gram conversion before assembling the final string.
    pieces = ngramer.unngram(tagged_grams)
    if verbose:
        print("Fixed to: %s " % (pieces))

    return "".join(pieces)
def smalltest(dictionary_file, ngram_order=1, verbose=False):
    """Build a model from *dictionary_file* and run one fixed sample through it.

    The dictionary is loaded with ``read_dictionary``, aligned with
    ``align_phoneme_sets``, converted with ``ngramer.ngram`` and passed to
    ``construct_hmm``.  ``test_one`` is then called once with the hard-coded
    input ``"AA M AX N D AX"`` and expected value ``"A - MO n DO"``; its
    return value is discarded.

    Args:
        dictionary_file: Passed to ``read_dictionary``.
        ngram_order: N-gram order used for the dictionary and for the test.
        verbose: Forwarded to ``test_one``.
    """
    import ngramer

    japanese, english = read_dictionary(dictionary_file)
    aligned_japanese, aligned_english = align_phoneme_sets(japanese, english)

    print("[Start] Converting dictionary to %i-Grams" % ngram_order)
    n_gramed_japanese = [ngramer.ngram(x, ngram_order) for x in aligned_japanese]
    n_gramed_english = [ngramer.ngram(x, ngram_order) for x in aligned_english]
    print("[ End ] Converting dictionary to %i-Grams" % ngram_order)

    model = construct_hmm(n_gramed_japanese, n_gramed_english)

    # The original also assigned two sample strings here that were never
    # read (one of them overwrote ``english``); they have been removed.
    test_one(model, "AA M AX N D AX", "A - MO n DO", ngram_order, verbose)
def newwordtest(dictionary_file, wordfile, ngram_order=1, verbose=False):
    """Build a model from *dictionary_file* and print a guess per word in *wordfile*.

    The dictionary is loaded with ``read_dictionary``, aligned with
    ``align_phoneme_sets``, converted with ``ngramer.ngram`` and passed to
    ``construct_hmm``.  Each non-blank line of *wordfile* is split on commas;
    the first field is printed as the word, and the second field is passed to
    ``guess`` as the sound string.  One ``word -> sound -> guess`` line is
    printed per entry.

    Args:
        dictionary_file: Passed to ``read_dictionary``.
        wordfile: Path of a text file with comma-separated lines whose first
            two fields are the word and its sound string.
        ngram_order: N-gram order used for the dictionary and for guessing.
        verbose: Forwarded to ``guess``.

    Raises:
        IndexError: If a non-blank line contains no comma.
    """
    import ngramer

    japanese, english = read_dictionary(dictionary_file)
    aligned_japanese, aligned_english = align_phoneme_sets(japanese, english)

    print("[Start] Converting dictionary to %i-Grams" % ngram_order)
    n_gramed_japanese = [ngramer.ngram(x, ngram_order) for x in aligned_japanese]
    n_gramed_english = [ngramer.ngram(x, ngram_order) for x in aligned_english]
    print("[ End ] Converting dictionary to %i-Grams" % ngram_order)

    model = construct_hmm(n_gramed_japanese, n_gramed_english)

    # Read everything up front inside ``with`` so the file handle is always
    # released (the original never closed it).
    with open(wordfile) as testfile:
        testlines = testfile.readlines()

    for raw_line in testlines:
        # Drop the line terminator; otherwise it stays attached to the last
        # field and splits the printed result across two lines.
        line = raw_line.rstrip("\r\n")
        if not line.strip():
            # Blank lines (e.g. a trailing empty line) carry no entry.
            continue
        fields = line.split(",")
        word = fields[0]
        sound = fields[1]
        kguess = guess(model, sound, ngram_order, verbose)
        print("%s -> %s -> %s" % (word, sound, kguess))