def main(): hmm_file = sys.argv[1] tag_file = sys.argv[2] hmm, tagset = get_param_tagset(hmm_file, tag_file) ts, test_tags = data_reader.read_tagging_data(sys.argv[3]) test_sentences = replace_test(ts, hmm, tagset) index = 0 for sentence in test_sentences: index += 1 print ' '.join(ts[index-1]) #sentence = "`` We would have to wait until we have collected on those assets before we can move forward , '' he said ." #sentence = replace_test([sentence.split(' ')], hmm, tagset)[0] #sys.stderr.write(str(index) + " " + ' '.join(sentence) + "\n") tagmap = filter_tagset(sentence, tagset, hmm) tagseqs = find_all_tagseqs(len(sentence), tagmap, "", []) resultmap = {} for tagline in tagseqs: tagseq = tagline.strip().split(' ') score = find_log_prob(hmm, tagset, sentence, tagseq) if score != '': key = ' '.join(tagseq) resultmap[key] = score sorted_x = sorted(resultmap.iteritems(), key=operator.itemgetter(1))[-10:] for k, v in sorted_x: print k, '\t', "{0:.2f}".format(v) print if index > 100: break
def execute(dataset, hmm_file, tag_file, k): hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file) ts, test_tags = data_reader.read_tagging_data(dataset) test_sentences = replace_test(ts, hmm, tagset) i = 0 conv_rates = [] # to keep track of the convergence rates varying with k for j in range(k): conv_rates.append(0.0) for sentence in test_sentences: k_best = [] if True: i += 1 truetags = test_tags[test_sentences.index(sentence)] sys.stderr.write('\n' + str(i)+ '\n') sys.stderr.write(' '.join(sentence) + "\n") print ' '.join(sentence) #TODO remove redundancy best_tags, num_iter, second_best, sb2 = dd_tagger_fst.run(sentence, tagset, hmm) conv_rates[0] += 1 k_best.append(best_tags) #next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best) if num_iter == -1: sys.stderr.write("2nd best does not converge :( \n") #print ' '.join(best_tags) #print #continue j = 2 # we have the best, and the second best now conv_rates[j-1] += 1 sys.stderr.write(str(j) + " best converges in " + str(num_iter) + " iterations \n") k_best.append(second_best) while j < k: next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best) k_best.append(next_best) if num_iter != -1: conv_rates[j] += 1 #k_best.append(next_best) sys.stderr.write(str(j+1) + " best converges in " + str(num_iter) + " iterations \n") else: sys.stderr.write(str(j+1) + "th best does not converge\n") #break j += 1 for best in k_best: print ' '.join(best) print for j in range(k): sys.stderr.write("convergence rate of " + str(j) + " best = " + str(conv_rates[j]*100/conv_rates[0]) + "% \n")
def execute(dataset, hmm_file, tag_file): hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file) ts, test_tags = data_reader.read_tagging_data(dataset) test_sentences = replace_test(ts, hmm, tagset) i = 0 conv = 0 sec_conv = 0 for sentence in test_sentences: if True: #len(sentence) < 15: i += 1 truetags = test_tags[test_sentences.index(sentence)] sys.stderr.write('\n' + str(i)+ '\n') sys.stderr.write(' '.join(sentence) + "\n") print ' '.join(sentence) k_best = [] best_tags, num_iter, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm) if tags2 == best_tags: sys.stderr.write("YOU ARE WRONG!\n") if num_iter != -1: sec_conv += 1 sys.stderr.write("2nd best converges in " + str(num_iter) + "\n") k_best.append(best_tags) k_best.append(tags1) third_best, num_iter2 = dd_k_best.run(sentence, tagset, hmm, k_best) if num_iter2 != -1: sys.stderr.write("3rd best converges in " + str(num_iter2) + "\n") conv += 1 k_best.append(third_best) fourth_best, num_iter3 = dd_k_best.run(sentence, tagset, hmm, k_best) sys.stderr.write("4th best converges in " + str(num_iter3) + "\n") print ' '.join(best_tags) print ' '.join(tags2) print ' '.join(third_best) print ' '.join(fourth_best) else: sys.stderr.write( "3rd best does not converge :(\n") else: sys.stderr.write("2nd best does not converge :(\n") continue print sys.stderr.write("% convergence of 2nd best =" + str(sec_conv*100/i) + "\n") sys.stderr.write("% convergence of 3rd best =" + str(conv*100/sec_conv) + "\n")
def main(): hmm_file = sys.argv[1] tag_file = sys.argv[2] hmm, tagset = get_param_tagset(hmm_file, tag_file) sentence = "We 're about to see if advertising works ." sentence = replace_test([sentence.split(' ')], hmm, tagset)[0] sys.stderr.write(' '.join(sentence) + "\n") tagmap = filter_tagset(sentence, tagset, hmm) tagseqs = find_all_tagseqs(len(sentence), tagmap, "", []) resultmap = {} for tagline in tagseqs: tagseq = tagline.strip().split(' ') score = find_log_prob(hmm, tagset, sentence, tagseq) if score != '': key = ' '.join(tagseq) resultmap[key] = score sorted_x = sorted(resultmap.iteritems(), key=operator.itemgetter(1)) for k, v in sorted_x: print k,"{0:.2f}".format(v)
def execute(dataset, hmm_file, tag_file): # sys.stderr.write("loading learnt parameters...\n") hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file) # sys.stderr.write("reading dev data...\n") ts, test_tags = data_reader.read_tagging_data(dataset) test_sentences = replace_test(ts, hmm, tagset) i = 0 converges = 0 avg_iterations = 0 start_time = time.time() fst_acc = 0 best_acc = 0 wrong = 0 for sentence in test_sentences: if True: i += 1 truetags = test_tags[test_sentences.index(sentence)] sys.stderr.write("\n" + str(i) + "\n") # sys.stderr.write(' '.join(ts[i-1]) + "\n") print " ".join(sentence) # print ' '.join(ts[i-1]) best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm) if tags2 == best_tags: sys.stderr.write("YOU ARE WRONG!\n") wrong += 1 if num_iterations != -1: facc = evaluate.accuracy(truetags, tags2) # sys.stderr.write("fst tagger accuracy = " + str(facc) + "\n") fst_acc += facc bacc = evaluate.accuracy(truetags, best_tags) # sys.stderr.write("best tags accuracy = " + str(bacc) + "\n") best_acc += bacc sys.stderr.write("converges in " + str(num_iterations) + " iterations \n") converges += 1 avg_iterations += num_iterations else: sys.stderr.write("does not converge :(\n") print " ".join(best_tags) print " ".join(tags2) # print "gold : ", ' '.join(truetags) print # if i == 100: # break sys.stderr.write("\nsystem performance\n--------------------\n") sys.stderr.write("\n" + str(wrong * 100 / converges) + "% sequences are wrong:\n") sys.stderr.write("\naverage accuracy of best: " + str(best_acc / converges) + "\n") sys.stderr.write("average accuracy of 2nd best: " + str(fst_acc / converges) + "\n") sys.stderr.write("\nsystem efficiency\n---------------------\n") sys.stderr.write("\n" + str(avg_iterations / converges) + " iterations on average\n") sys.stderr.write(str(converges * 100 / i) + " % convergence\n") sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")
word = sent[i] score += get_local_score(word, prev, tag, hmm) prev = tag score += get_local_score("", prev, "STOP", hmm) return score def get_aug_hmm(seq, sent, hmm, dd_u): score = get_hmm_only_score(seq, sent, hmm) for i in range(len(seq)): score -= dd_u[i][seq[i]] return score if __name__ == "__main__": sentences, truetags = data_reader.read_tagging_data(sys.argv[1]) hmm, tagset = hmm_utils.get_param_tagset(sys.argv[2], sys.argv[3]) sentences = replace_test(sentences, hmm, tagset) print sentences i = 0 tot_acc = 0.0 for sentence in sentences: # for tag in tagset: # print tag,"\t", # print tags = run(sentence, tagset, hmm, None) #tot_acc += evaluate.accuracy(truetags, tags) print sentence print tags, evaluate.accuracy(truetags[i], tags) print truetags[i], " :gold" i+=1 print "---------------------------"