def execute(dataset, hmm_file, tag_file, k):
    """Decode the k best tag sequences for every sentence of a dataset.

    The best and second-best sequences of a sentence come from
    ``dd_tagger_fst.run``; each further sequence comes from
    ``dd_k_best.run``, which is handed the sequences collected so far.
    Each sentence is written to stdout followed by its sequences (one per
    line) and a blank line.  Progress and convergence statistics are
    written to stderr.

    Args:
        dataset: forwarded to ``data_reader.read_tagging_data``.
        hmm_file: forwarded, with ``tag_file``, to
            ``hmm_utils.get_param_tagset``.
        tag_file: see ``hmm_file``.
        k: number of tag sequences to produce per sentence (at least 1).

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got " + str(k))

    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, _ = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    # conv_rates[r] counts the sentences whose (r + 1)-best sequence
    # converged.  Floats keep the final percentages in true division.
    # conv_rates[0] is incremented for every sentence and serves as the
    # denominator of the report.
    conv_rates = [0.0] * k

    for number, sentence in enumerate(test_sentences, 1):
        sys.stderr.write('\n' + str(number) + '\n')
        sys.stderr.write(' '.join(sentence) + "\n")
        sys.stdout.write(' '.join(sentence) + "\n")

        # TODO remove redundancy
        best_tags, num_iter, second_best, _ = dd_tagger_fst.run(sentence, tagset, hmm)
        conv_rates[0] += 1
        k_best = [best_tags]

        if k > 1:
            # A return value of -1 signals non-convergence; only count and
            # announce the 2nd best as converged when it actually did.
            if num_iter != -1:
                conv_rates[1] += 1
                sys.stderr.write("2 best converges in " + str(num_iter) + " iterations \n")
            else:
                sys.stderr.write("2nd best does not converge :( \n")
            # The candidate is kept either way so later ranks can build on it.
            k_best.append(second_best)

        for rank in range(2, k):
            next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best)
            k_best.append(next_best)
            if num_iter != -1:
                conv_rates[rank] += 1
                sys.stderr.write(str(rank + 1) + " best converges in " + str(num_iter) + " iterations \n")
            else:
                sys.stderr.write(str(rank + 1) + "th best does not converge\n")

        for tags in k_best:
            sys.stdout.write(' '.join(tags) + "\n")
        sys.stdout.write("\n")

    total = conv_rates[0]
    if not total:
        # Nothing was decoded, so every rate would be a division by zero.
        sys.stderr.write("no sentences decoded; convergence rates undefined\n")
        return

    for rank in range(k):
        # Index `rank` holds the count for the (rank + 1)-best sequence.
        sys.stderr.write("convergence rate of " + str(rank + 1) + " best = " + str(conv_rates[rank] * 100 / total) + "% \n")
def execute(dataset, hmm_file, tag_file):
    """Decode up to four best tag sequences per sentence and report convergence.

    For every sentence ``dd_tagger_fst.run`` supplies the best and
    second-best sequences.  When the second best converges,
    ``dd_k_best.run`` is asked for a third and, if that converges too, a
    fourth sequence.  The sentence is always written to stdout; the four
    sequences are written only when the third best converged.  Progress
    and convergence percentages go to stderr.

    Args:
        dataset: forwarded to ``data_reader.read_tagging_data``.
        hmm_file: forwarded, with ``tag_file``, to
            ``hmm_utils.get_param_tagset``.
        tag_file: see ``hmm_file``.
    """
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, _ = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    total = 0     # sentences processed
    sec_conv = 0  # sentences whose 2nd best converged
    conv = 0      # sentences whose 3rd best converged

    for total, sentence in enumerate(test_sentences, 1):
        sys.stderr.write('\n' + str(total) + '\n')
        sys.stderr.write(' '.join(sentence) + "\n")
        sys.stdout.write(' '.join(sentence) + "\n")

        best_tags, num_iter, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
        if tags2 == best_tags:
            sys.stderr.write("YOU ARE WRONG!\n")

        # -1 is the non-convergence marker; nothing more to do for this sentence.
        if num_iter == -1:
            sys.stderr.write("2nd best does not converge :(\n")
            continue

        sec_conv += 1
        sys.stderr.write("2nd best converges in " + str(num_iter) + "\n")

        # NOTE(review): tags1 seeds the k-best search while tags2 is what gets
        # printed as the second best; kept exactly as in the original.
        k_best = [best_tags, tags1]
        third_best, num_iter2 = dd_k_best.run(sentence, tagset, hmm, k_best)
        if num_iter2 != -1:
            sys.stderr.write("3rd best converges in " + str(num_iter2) + "\n")
            conv += 1
            k_best.append(third_best)
            fourth_best, num_iter3 = dd_k_best.run(sentence, tagset, hmm, k_best)
            # Report the 4th best the same way as the other ranks instead of
            # claiming it "converges in -1".
            if num_iter3 != -1:
                sys.stderr.write("4th best converges in " + str(num_iter3) + "\n")
            else:
                sys.stderr.write("4th best does not converge :(\n")
            sys.stdout.write(' '.join(best_tags) + "\n")
            sys.stdout.write(' '.join(tags2) + "\n")
            sys.stdout.write(' '.join(third_best) + "\n")
            sys.stdout.write(' '.join(fourth_best) + "\n")
        else:
            sys.stderr.write("3rd best does not converge :(\n")
        sys.stdout.write("\n")

    # Guard both ratios: an empty dataset or a run in which the 2nd best never
    # converged would otherwise end in ZeroDivisionError.
    if total:
        sys.stderr.write("% convergence of 2nd best =" + str(sec_conv * 100 / total) + "\n")
    else:
        sys.stderr.write("% convergence of 2nd best =n/a (no sentences)\n")
    if sec_conv:
        sys.stderr.write("% convergence of 3rd best =" + str(conv * 100 / sec_conv) + "\n")
    else:
        sys.stderr.write("% convergence of 3rd best =n/a (2nd best never converged)\n")
def execute(dataset, hmm_file, tag_file):
    """Tag every non-empty sentence of a dataset and log per-sentence accuracy.

    Each non-empty sentence is decoded with ``dd_tagger_fst.run``.  The
    sentence, the two returned tag sequences and the gold tags are printed
    with ``tagprint``.  Accuracy (via ``evaluate.accuracy``) is logged only
    for sentences that converged.  The average iteration count, the
    convergence percentage and the elapsed time are written to stderr at
    the end.

    Args:
        dataset: forwarded to ``data_reader.read_tagging_data``.
        hmm_file: forwarded, with ``tag_file``, to
            ``hmm_utils.get_param_tagset``.
        tag_file: see ``hmm_file``.
    """
    sys.stderr.write("loading learnt parameters...\n")
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    sys.stderr.write("reading dev data...\n")
    test_sentences, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(test_sentences, hmm, tagset)

    processed = 0         # non-empty sentences seen so far
    converges = 0         # sentences for which decoding converged
    total_iterations = 0  # iterations summed over converged sentences
    start_time = time.time()

    # Track the position explicitly.  Looking it up with list.index() returns
    # the first equal sentence, which pairs duplicated sentences with the wrong
    # gold tags and costs O(n) per sentence.
    for position, sentence in enumerate(test_sentences):
        if len(sentence) == 0:
            continue
        processed += 1
        truetags = test_tags[position]
        sys.stderr.write('\n' + str(processed) + '\n')
        # test_sents_not_rare is not defined in this function; presumably the
        # sentences before rare-word replacement -- confirm against the module.
        tagprint(test_sents_not_rare[position])

        best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
        tagprint(best_tags)
        tagprint(tags2)

        if num_iterations != -1:
            sys.stderr.write("fst tagger accuracy = " + str(evaluate.accuracy(truetags, tags2)) + "\n")
            sys.stderr.write("best tags accuracy = " + str(evaluate.accuracy(truetags, best_tags)) + "\n")
            sys.stderr.write("converges in " + str(num_iterations) + " iterations \n")
            converges += 1
            total_iterations += num_iterations
        else:
            sys.stdout.write("\n")
            sys.stderr.write("does not converge :(\n")

        tagprint(truetags)
        sys.stdout.write("\n")

    # Each ratio is reported only when its denominator is non-zero, so an empty
    # or never-converging run no longer ends in ZeroDivisionError.
    if converges:
        sys.stderr.write("\n" + str(total_iterations / converges) + " iterations on average\n")
    else:
        sys.stderr.write("\nno sentence converged\n")
    if processed:
        sys.stderr.write(str(converges * 100 / processed) + " % convergence\n")
    sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")
def execute(dataset, hmm_file, tag_file):
    """Tag a dataset and summarise accuracy and convergence on stderr.

    Each sentence is decoded with ``dd_tagger_fst.run``.  The sentence and
    the two returned tag sequences are written to stdout, followed by a
    blank line.  For converged sentences the accuracy of both sequences
    against the gold tags is accumulated, and a performance and efficiency
    summary is written to stderr once all sentences are processed.

    Args:
        dataset: forwarded to ``data_reader.read_tagging_data``.
        hmm_file: forwarded, with ``tag_file``, to
            ``hmm_utils.get_param_tagset``.
        tag_file: see ``hmm_file``.
    """
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    total = 0             # sentences processed
    converges = 0         # sentences for which decoding converged
    total_iterations = 0  # iterations summed over converged sentences
    fst_acc = 0           # summed accuracy of tags2 on converged sentences
    best_acc = 0          # summed accuracy of best_tags on converged sentences
    wrong = 0             # sentences where tags2 equals best_tags
    start_time = time.time()

    # enumerate() gives the true position of each sentence.  list.index() would
    # return the first equal sentence and so fetch the wrong gold tags for
    # duplicates.
    for position, sentence in enumerate(test_sentences):
        total = position + 1
        truetags = test_tags[position]
        sys.stderr.write("\n" + str(total) + "\n")
        sys.stdout.write(" ".join(sentence) + "\n")

        best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)

        # NOTE(review): `wrong` is counted for every sentence but later divided
        # by the number of converged ones -- confirm that ratio is intended.
        if tags2 == best_tags:
            sys.stderr.write("YOU ARE WRONG!\n")
            wrong += 1

        if num_iterations != -1:
            fst_acc += evaluate.accuracy(truetags, tags2)
            best_acc += evaluate.accuracy(truetags, best_tags)
            sys.stderr.write("converges in " + str(num_iterations) + " iterations \n")
            converges += 1
            total_iterations += num_iterations
        else:
            sys.stderr.write("does not converge :(\n")

        sys.stdout.write(" ".join(best_tags) + "\n")
        sys.stdout.write(" ".join(tags2) + "\n")
        sys.stdout.write("\n")

    # Every statistic below divides by `converges` or `total`; each is skipped
    # when its denominator is zero instead of raising ZeroDivisionError.
    sys.stderr.write("\nsystem performance\n--------------------\n")
    if converges:
        sys.stderr.write("\n" + str(wrong * 100 / converges) + "% sequences are wrong:\n")
        sys.stderr.write("\naverage accuracy of best: " + str(best_acc / converges) + "\n")
        sys.stderr.write("average accuracy of 2nd best: " + str(fst_acc / converges) + "\n")
    else:
        sys.stderr.write("\nno sentence converged; accuracy statistics unavailable\n")

    sys.stderr.write("\nsystem efficiency\n---------------------\n")
    if converges:
        sys.stderr.write("\n" + str(total_iterations / converges) + " iterations on average\n")
    if total:
        sys.stderr.write(str(converges * 100 / total) + " % convergence\n")
    sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")